From fe24634d14bc0973ca38222db2f58eafbf0c890d Mon Sep 17 00:00:00 2001
From: Tathagata Das
Date: Fri, 23 Jun 2017 00:43:21 -0700
Subject: [PATCH 001/779] [SPARK-21145][SS] Added StateStoreProviderId with
 queryRunId to reload StateStoreProviders when query is restarted

## What changes were proposed in this pull request?

StateStoreProvider instances are loaded on demand in an executor when a query is started. When a query is restarted, the loaded provider instance is reused. Now, there is a non-trivial chance that a task of the previous query run is still running while the tasks of the restarted run have started. So for a stateful partition, there may be two concurrent tasks related to the same stateful partition, and therefore using the same provider instance. This can lead to inconsistent results and possibly random failures, as state store implementations are not designed to be thread-safe.

To fix this, I have introduced a `StateStoreProviderId` that uniquely identifies a provider loaded in an executor. It contains the query run id, which ensures that restarted queries force the executor to load a new provider instance, preventing two concurrent tasks (from two different runs) from reusing the same provider instance.

Additional minor bug fixes
- All state stores related to a query run are marked as deactivated in the `StateStoreCoordinator`, so that the executors can unload them and clear resources.
- Moved the code that determines the checkpoint directory of a state store from implementation-specific code (`HDFSBackedStateStoreProvider`) to non-specific code (`StateStoreId`), so that implementations do not accidentally get it wrong.
- Also added the store name to the path, to support multiple stores per SQL operator partition.

*Note:* This change does not address the scenario where two tasks of the same run (e.g. speculative tasks) are concurrently running in the same executor. The chance of this is very small, because ideally speculative tasks should never run in the same executor.

## How was this patch tested?

Existing unit tests + new unit test.

Author: Tathagata Das

Closes #18355 from tdas/SPARK-21145.
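To make the core idea concrete, here is a minimal, self-contained Scala sketch of why keying the executor-side provider cache by `(StateStoreId, queryRunId)` forces a restarted query to load a fresh provider instance. The classes here are hypothetical stand-ins that only mirror the names in the patch; they are not the actual Spark implementation.

```scala
import java.util.UUID
import scala.collection.mutable

// Hypothetical, simplified stand-ins for the real Spark classes.
case class StateStoreId(
    checkpointRootLocation: String,
    operatorId: Long,
    partitionId: Int,
    storeName: String = "default")

case class StateStoreProviderId(storeId: StateStoreId, queryRunId: UUID)

// Placeholder for a loaded state store provider instance.
class DummyProvider

object ProviderCache {
  // Executor-side cache keyed by provider id; because the key contains the
  // query run id, a restarted query (new run id) can never hit the old entry.
  private val loadedProviders = mutable.HashMap.empty[StateStoreProviderId, DummyProvider]

  def getOrLoad(id: StateStoreProviderId): DummyProvider = loadedProviders.synchronized {
    loadedProviders.getOrElseUpdate(id, new DummyProvider)
  }
}

object ProviderCacheDemo extends App {
  val storeId = StateStoreId("/checkpoint/state", operatorId = 0, partitionId = 0)

  val run1 = StateStoreProviderId(storeId, UUID.randomUUID())
  val run2 = StateStoreProviderId(storeId, UUID.randomUUID()) // restarted query => new run id

  val p1 = ProviderCache.getOrLoad(run1)
  val p2 = ProviderCache.getOrLoad(run2)

  assert(p1 eq ProviderCache.getOrLoad(run1)) // same run reuses the same instance
  assert(!(p1 eq p2))                         // different runs get distinct instances
}
```

In the real patch the map value is a `StateStoreProvider` and the cache lives in the `StateStore` object; the sketch only mirrors the keying scheme that makes the isolation work.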
--- .../sql/execution/aggregate/AggUtils.scala | 2 +- .../sql/execution/command/commands.scala | 5 +- .../FlatMapGroupsWithStateExec.scala | 7 +- .../streaming/IncrementalExecution.scala | 27 +++--- .../execution/streaming/StreamExecution.scala | 1 + .../state/HDFSBackedStateStoreProvider.scala | 16 ++-- .../streaming/state/StateStore.scala | 91 +++++++++++++----- .../state/StateStoreCoordinator.scala | 41 ++++---- .../streaming/state/StateStoreRDD.scala | 21 ++++- .../execution/streaming/state/package.scala | 25 ++--- .../streaming/statefulOperators.scala | 38 ++++---- .../sql/streaming/StreamingQueryManager.scala | 1 + .../state/StateStoreCoordinatorSuite.scala | 61 ++++++++++-- .../streaming/state/StateStoreRDDSuite.scala | 51 +++++----- .../streaming/state/StateStoreSuite.scala | 93 +++++++++++++++---- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../spark/sql/streaming/StreamTest.scala | 13 ++- 17 files changed, 329 insertions(+), 166 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala index aa789af6f812f..12f8cffb6774a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala @@ -311,7 +311,7 @@ object AggUtils { val saved = StateStoreSaveExec( groupingAttributes, - stateId = None, + stateInfo = None, outputMode = None, eventTimeWatermark = None, partialMerged2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 2d82fcf4da6e9..81bc93e7ebcf4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.command +import java.util.UUID + import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} @@ -117,7 +119,8 @@ case class ExplainCommand( // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. 
new IncrementalExecution( - sparkSession, logicalPlan, OutputMode.Append(), "", 0, OffsetSeqMetadata(0, 0)) + sparkSession, logicalPlan, OutputMode.Append(), "", + UUID.randomUUID, 0, OffsetSeqMetadata(0, 0)) } else { sparkSession.sessionState.executePlan(logicalPlan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index 2aad8701a4eca..9dcac33b4107c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -50,7 +50,7 @@ case class FlatMapGroupsWithStateExec( groupingAttributes: Seq[Attribute], dataAttributes: Seq[Attribute], outputObjAttr: Attribute, - stateId: Option[OperatorStateId], + stateInfo: Option[StatefulOperatorStateInfo], stateEncoder: ExpressionEncoder[Any], outputMode: OutputMode, timeoutConf: GroupStateTimeout, @@ -107,10 +107,7 @@ case class FlatMapGroupsWithStateExec( } child.execute().mapPartitionsWithStateStore[InternalRow]( - getStateId.checkpointLocation, - getStateId.operatorId, - storeName = "default", - getStateId.batchId, + getStateInfo, groupingAttributes.toStructType, stateAttributes.toStructType, indexOrdinal = None, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 622e049630db2..ab89dc6b705d5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.streaming +import java.util.UUID import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.internal.Logging @@ -36,6 +37,7 @@ class IncrementalExecution( logicalPlan: LogicalPlan, val outputMode: OutputMode, val checkpointLocation: String, + val runId: UUID, val currentBatchId: Long, offsetSeqMetadata: OffsetSeqMetadata) extends QueryExecution(sparkSession, logicalPlan) with Logging { @@ -69,7 +71,13 @@ class IncrementalExecution( * Records the current id for a given stateful operator in the query plan as the `state` * preparation walks the query plan. */ - private val operatorId = new AtomicInteger(0) + private val statefulOperatorId = new AtomicInteger(0) + + /** Get the state info of the next stateful operator */ + private def nextStatefulOperationStateInfo(): StatefulOperatorStateInfo = { + StatefulOperatorStateInfo( + checkpointLocation, runId, statefulOperatorId.getAndIncrement(), currentBatchId) + } /** Locates save/restore pairs surrounding aggregation. 
*/ val state = new Rule[SparkPlan] { @@ -78,35 +86,28 @@ class IncrementalExecution( case StateStoreSaveExec(keys, None, None, None, UnaryExecNode(agg, StateStoreRestoreExec(keys2, None, child))) => - val stateId = - OperatorStateId(checkpointLocation, operatorId.getAndIncrement(), currentBatchId) - + val aggStateInfo = nextStatefulOperationStateInfo StateStoreSaveExec( keys, - Some(stateId), + Some(aggStateInfo), Some(outputMode), Some(offsetSeqMetadata.batchWatermarkMs), agg.withNewChildren( StateStoreRestoreExec( keys, - Some(stateId), + Some(aggStateInfo), child) :: Nil)) case StreamingDeduplicateExec(keys, child, None, None) => - val stateId = - OperatorStateId(checkpointLocation, operatorId.getAndIncrement(), currentBatchId) - StreamingDeduplicateExec( keys, child, - Some(stateId), + Some(nextStatefulOperationStateInfo), Some(offsetSeqMetadata.batchWatermarkMs)) case m: FlatMapGroupsWithStateExec => - val stateId = - OperatorStateId(checkpointLocation, operatorId.getAndIncrement(), currentBatchId) m.copy( - stateId = Some(stateId), + stateInfo = Some(nextStatefulOperationStateInfo), batchTimestampMs = Some(offsetSeqMetadata.batchTimestampMs), eventTimeWatermark = Some(offsetSeqMetadata.batchWatermarkMs)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 74f0f509bbf85..06bdec8b06407 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -652,6 +652,7 @@ class StreamExecution( triggerLogicalPlan, outputMode, checkpointFile("state"), + runId, currentBatchId, offsetSeqMetadata) lastExecution.executedPlan // Force the lazy generation of execution plan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index 67d86daf10812..bae7a15165e43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -92,7 +92,7 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit @volatile private var state: STATE = UPDATING @volatile private var finalDeltaFile: Path = null - override def id: StateStoreId = HDFSBackedStateStoreProvider.this.id + override def id: StateStoreId = HDFSBackedStateStoreProvider.this.stateStoreId override def get(key: UnsafeRow): UnsafeRow = { mapToUpdate.get(key) @@ -177,7 +177,7 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit /** * Whether all updates have been committed */ - override private[streaming] def hasCommitted: Boolean = { + override def hasCommitted: Boolean = { state == COMMITTED } @@ -205,7 +205,7 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit indexOrdinal: Option[Int], // for sorting the data storeConf: StateStoreConf, hadoopConf: Configuration): Unit = { - this.stateStoreId = stateStoreId + this.stateStoreId_ = stateStoreId this.keySchema = keySchema this.valueSchema = valueSchema this.storeConf = storeConf @@ -213,7 +213,7 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit fs.mkdirs(baseDir) } - override 
def id: StateStoreId = stateStoreId + override def stateStoreId: StateStoreId = stateStoreId_ /** Do maintenance backing data files, including creating snapshots and cleaning up old files */ override def doMaintenance(): Unit = { @@ -231,20 +231,20 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit } override def toString(): String = { - s"HDFSStateStoreProvider[id = (op=${id.operatorId}, part=${id.partitionId}), dir = $baseDir]" + s"HDFSStateStoreProvider[" + + s"id = (op=${stateStoreId.operatorId},part=${stateStoreId.partitionId}),dir = $baseDir]" } /* Internal fields and methods */ - @volatile private var stateStoreId: StateStoreId = _ + @volatile private var stateStoreId_ : StateStoreId = _ @volatile private var keySchema: StructType = _ @volatile private var valueSchema: StructType = _ @volatile private var storeConf: StateStoreConf = _ @volatile private var hadoopConf: Configuration = _ private lazy val loadedMaps = new mutable.HashMap[Long, MapType] - private lazy val baseDir = - new Path(id.checkpointLocation, s"${id.operatorId}/${id.partitionId.toString}") + private lazy val baseDir = stateStoreId.storeCheckpointLocation() private lazy val fs = baseDir.getFileSystem(hadoopConf) private lazy val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 29c456f86e1ed..a94ff8a7ebd1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.streaming.state +import java.util.UUID import java.util.concurrent.{ScheduledFuture, TimeUnit} import javax.annotation.concurrent.GuardedBy @@ -24,14 +25,14 @@ import scala.collection.mutable import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path -import org.apache.spark.SparkEnv +import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.types.StructType import org.apache.spark.util.{ThreadUtils, Utils} - /** * Base trait for a versioned key-value store. Each instance of a `StateStore` represents a specific * version of state data, and such instances are created through a [[StateStoreProvider]]. @@ -99,7 +100,7 @@ trait StateStore { /** * Whether all updates have been committed */ - private[streaming] def hasCommitted: Boolean + def hasCommitted: Boolean } @@ -147,7 +148,7 @@ trait StateStoreProvider { * Return the id of the StateStores this provider will generate. * Should be the same as the one passed in init(). */ - def id: StateStoreId + def stateStoreId: StateStoreId /** Called when the provider instance is unloaded from the executor */ def close(): Unit @@ -179,13 +180,46 @@ object StateStoreProvider { } } +/** + * Unique identifier for a provider, used to identify when providers can be reused. + * Note that `queryRunId` is used uniquely identify a provider, so that the same provider + * instance is not reused across query restarts. + */ +case class StateStoreProviderId(storeId: StateStoreId, queryRunId: UUID) -/** Unique identifier for a bunch of keyed state data. */ +/** + * Unique identifier for a bunch of keyed state data. 
+ * @param checkpointRootLocation Root directory where all the state data of a query is stored + * @param operatorId Unique id of a stateful operator + * @param partitionId Index of the partition of an operators state data + * @param storeName Optional, name of the store. Each partition can optionally use multiple state + * stores, but they have to be identified by distinct names. + */ case class StateStoreId( - checkpointLocation: String, + checkpointRootLocation: String, operatorId: Long, partitionId: Int, - name: String = "") + storeName: String = StateStoreId.DEFAULT_STORE_NAME) { + + /** + * Checkpoint directory to be used by a single state store, identified uniquely by the tuple + * (operatorId, partitionId, storeName). All implementations of [[StateStoreProvider]] should + * use this path for saving state data, as this ensures that distinct stores will write to + * different locations. + */ + def storeCheckpointLocation(): Path = { + if (storeName == StateStoreId.DEFAULT_STORE_NAME) { + // For reading state store data that was generated before store names were used (Spark <= 2.2) + new Path(checkpointRootLocation, s"$operatorId/$partitionId") + } else { + new Path(checkpointRootLocation, s"$operatorId/$partitionId/$storeName") + } + } +} + +object StateStoreId { + val DEFAULT_STORE_NAME = "default" +} /** Mutable, and reusable class for representing a pair of UnsafeRows. */ class UnsafeRowPair(var key: UnsafeRow = null, var value: UnsafeRow = null) { @@ -211,7 +245,7 @@ object StateStore extends Logging { val MAINTENANCE_INTERVAL_DEFAULT_SECS = 60 @GuardedBy("loadedProviders") - private val loadedProviders = new mutable.HashMap[StateStoreId, StateStoreProvider]() + private val loadedProviders = new mutable.HashMap[StateStoreProviderId, StateStoreProvider]() /** * Runs the `task` periodically and automatically cancels it if there is an exception. `onError` @@ -253,7 +287,7 @@ object StateStore extends Logging { /** Get or create a store associated with the id. 
*/ def get( - storeId: StateStoreId, + storeProviderId: StateStoreProviderId, keySchema: StructType, valueSchema: StructType, indexOrdinal: Option[Int], @@ -264,24 +298,24 @@ object StateStore extends Logging { val storeProvider = loadedProviders.synchronized { startMaintenanceIfNeeded() val provider = loadedProviders.getOrElseUpdate( - storeId, + storeProviderId, StateStoreProvider.instantiate( - storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) + storeProviderId.storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) ) - reportActiveStoreInstance(storeId) + reportActiveStoreInstance(storeProviderId) provider } storeProvider.getStore(version) } /** Unload a state store provider */ - def unload(storeId: StateStoreId): Unit = loadedProviders.synchronized { - loadedProviders.remove(storeId).foreach(_.close()) + def unload(storeProviderId: StateStoreProviderId): Unit = loadedProviders.synchronized { + loadedProviders.remove(storeProviderId).foreach(_.close()) } /** Whether a state store provider is loaded or not */ - def isLoaded(storeId: StateStoreId): Boolean = loadedProviders.synchronized { - loadedProviders.contains(storeId) + def isLoaded(storeProviderId: StateStoreProviderId): Boolean = loadedProviders.synchronized { + loadedProviders.contains(storeProviderId) } def isMaintenanceRunning: Boolean = loadedProviders.synchronized { @@ -340,21 +374,21 @@ object StateStore extends Logging { } } - private def reportActiveStoreInstance(storeId: StateStoreId): Unit = { + private def reportActiveStoreInstance(storeProviderId: StateStoreProviderId): Unit = { if (SparkEnv.get != null) { val host = SparkEnv.get.blockManager.blockManagerId.host val executorId = SparkEnv.get.blockManager.blockManagerId.executorId - coordinatorRef.foreach(_.reportActiveInstance(storeId, host, executorId)) - logDebug(s"Reported that the loaded instance $storeId is active") + coordinatorRef.foreach(_.reportActiveInstance(storeProviderId, host, executorId)) + logInfo(s"Reported that the loaded instance $storeProviderId is active") } } - private def verifyIfStoreInstanceActive(storeId: StateStoreId): Boolean = { + private def verifyIfStoreInstanceActive(storeProviderId: StateStoreProviderId): Boolean = { if (SparkEnv.get != null) { val executorId = SparkEnv.get.blockManager.blockManagerId.executorId val verified = - coordinatorRef.map(_.verifyIfInstanceActive(storeId, executorId)).getOrElse(false) - logDebug(s"Verified whether the loaded instance $storeId is active: $verified") + coordinatorRef.map(_.verifyIfInstanceActive(storeProviderId, executorId)).getOrElse(false) + logDebug(s"Verified whether the loaded instance $storeProviderId is active: $verified") verified } else { false @@ -364,12 +398,21 @@ object StateStore extends Logging { private def coordinatorRef: Option[StateStoreCoordinatorRef] = loadedProviders.synchronized { val env = SparkEnv.get if (env != null) { - if (_coordRef == null) { + logInfo("Env is not null") + val isDriver = + env.executorId == SparkContext.DRIVER_IDENTIFIER || + env.executorId == SparkContext.LEGACY_DRIVER_IDENTIFIER + // If running locally, then the coordinator reference in _coordRef may be have become inactive + // as SparkContext + SparkEnv may have been restarted. Hence, when running in driver, + // always recreate the reference. 
+ if (isDriver || _coordRef == null) { + logInfo("Getting StateStoreCoordinatorRef") _coordRef = StateStoreCoordinatorRef.forExecutor(env) } - logDebug(s"Retrieved reference to StateStoreCoordinator: ${_coordRef}") + logInfo(s"Retrieved reference to StateStoreCoordinator: ${_coordRef}") Some(_coordRef) } else { + logInfo("Env is null") _coordRef = null None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala index d0f81887e62d1..3884f5e6ce766 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinator.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming.state +import java.util.UUID + import scala.collection.mutable import org.apache.spark.SparkEnv @@ -29,16 +31,19 @@ import org.apache.spark.util.RpcUtils private sealed trait StateStoreCoordinatorMessage extends Serializable /** Classes representing messages */ -private case class ReportActiveInstance(storeId: StateStoreId, host: String, executorId: String) +private case class ReportActiveInstance( + storeId: StateStoreProviderId, + host: String, + executorId: String) extends StateStoreCoordinatorMessage -private case class VerifyIfInstanceActive(storeId: StateStoreId, executorId: String) +private case class VerifyIfInstanceActive(storeId: StateStoreProviderId, executorId: String) extends StateStoreCoordinatorMessage -private case class GetLocation(storeId: StateStoreId) +private case class GetLocation(storeId: StateStoreProviderId) extends StateStoreCoordinatorMessage -private case class DeactivateInstances(checkpointLocation: String) +private case class DeactivateInstances(runId: UUID) extends StateStoreCoordinatorMessage private object StopCoordinator @@ -80,25 +85,27 @@ object StateStoreCoordinatorRef extends Logging { class StateStoreCoordinatorRef private(rpcEndpointRef: RpcEndpointRef) { private[state] def reportActiveInstance( - storeId: StateStoreId, + stateStoreProviderId: StateStoreProviderId, host: String, executorId: String): Unit = { - rpcEndpointRef.send(ReportActiveInstance(storeId, host, executorId)) + rpcEndpointRef.send(ReportActiveInstance(stateStoreProviderId, host, executorId)) } /** Verify whether the given executor has the active instance of a state store */ - private[state] def verifyIfInstanceActive(storeId: StateStoreId, executorId: String): Boolean = { - rpcEndpointRef.askSync[Boolean](VerifyIfInstanceActive(storeId, executorId)) + private[state] def verifyIfInstanceActive( + stateStoreProviderId: StateStoreProviderId, + executorId: String): Boolean = { + rpcEndpointRef.askSync[Boolean](VerifyIfInstanceActive(stateStoreProviderId, executorId)) } /** Get the location of the state store */ - private[state] def getLocation(storeId: StateStoreId): Option[String] = { - rpcEndpointRef.askSync[Option[String]](GetLocation(storeId)) + private[state] def getLocation(stateStoreProviderId: StateStoreProviderId): Option[String] = { + rpcEndpointRef.askSync[Option[String]](GetLocation(stateStoreProviderId)) } - /** Deactivate instances related to a set of operator */ - private[state] def deactivateInstances(storeRootLocation: String): Unit = { - rpcEndpointRef.askSync[Boolean](DeactivateInstances(storeRootLocation)) + /** Deactivate instances related to a query */ + private[sql] def 
deactivateInstances(runId: UUID): Unit = { + rpcEndpointRef.askSync[Boolean](DeactivateInstances(runId)) } private[state] def stop(): Unit = { @@ -113,7 +120,7 @@ class StateStoreCoordinatorRef private(rpcEndpointRef: RpcEndpointRef) { */ private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { - private val instances = new mutable.HashMap[StateStoreId, ExecutorCacheTaskLocation] + private val instances = new mutable.HashMap[StateStoreProviderId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => @@ -135,11 +142,11 @@ private class StateStoreCoordinator(override val rpcEnv: RpcEnv) logDebug(s"Got location of the state store $id: $executorId") context.reply(executorId) - case DeactivateInstances(checkpointLocation) => + case DeactivateInstances(runId) => val storeIdsToRemove = - instances.keys.filter(_.checkpointLocation == checkpointLocation).toSeq + instances.keys.filter(_.queryRunId == runId).toSeq instances --= storeIdsToRemove - logDebug(s"Deactivating instances related to checkpoint location $checkpointLocation: " + + logDebug(s"Deactivating instances related to checkpoint location $runId: " + storeIdsToRemove.mkString(", ")) context.reply(true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala index b744c25dc97a8..01d8e75980993 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming.state +import java.util.UUID + import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} @@ -34,8 +36,8 @@ class StateStoreRDD[T: ClassTag, U: ClassTag]( dataRDD: RDD[T], storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U], checkpointLocation: String, + queryRunId: UUID, operatorId: Long, - storeName: String, storeVersion: Long, keySchema: StructType, valueSchema: StructType, @@ -52,16 +54,25 @@ class StateStoreRDD[T: ClassTag, U: ClassTag]( override protected def getPartitions: Array[Partition] = dataRDD.partitions + /** + * Set the preferred location of each partition using the executor that has the related + * [[StateStoreProvider]] already loaded. 
+ */ override def getPreferredLocations(partition: Partition): Seq[String] = { - val storeId = StateStoreId(checkpointLocation, operatorId, partition.index, storeName) - storeCoordinator.flatMap(_.getLocation(storeId)).toSeq + val stateStoreProviderId = StateStoreProviderId( + StateStoreId(checkpointLocation, operatorId, partition.index), + queryRunId) + storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq } override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = { var store: StateStore = null - val storeId = StateStoreId(checkpointLocation, operatorId, partition.index, storeName) + val storeProviderId = StateStoreProviderId( + StateStoreId(checkpointLocation, operatorId, partition.index), + queryRunId) + store = StateStore.get( - storeId, keySchema, valueSchema, indexOrdinal, storeVersion, + storeProviderId, keySchema, valueSchema, indexOrdinal, storeVersion, storeConf, hadoopConfBroadcast.value.value) val inputIter = dataRDD.iterator(partition, ctxt) storeUpdateFunction(store, inputIter) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala index 228fe86d59940..a0086e251f9c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming +import java.util.UUID + import scala.reflect.ClassTag import org.apache.spark.TaskContext @@ -32,20 +34,14 @@ package object state { /** Map each partition of an RDD along with data in a [[StateStore]]. */ def mapPartitionsWithStateStore[U: ClassTag]( sqlContext: SQLContext, - checkpointLocation: String, - operatorId: Long, - storeName: String, - storeVersion: Long, + stateInfo: StatefulOperatorStateInfo, keySchema: StructType, valueSchema: StructType, indexOrdinal: Option[Int])( storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U]): StateStoreRDD[T, U] = { mapPartitionsWithStateStore( - checkpointLocation, - operatorId, - storeName, - storeVersion, + stateInfo, keySchema, valueSchema, indexOrdinal, @@ -56,10 +52,7 @@ package object state { /** Map each partition of an RDD along with data in a [[StateStore]]. 
*/ private[streaming] def mapPartitionsWithStateStore[U: ClassTag]( - checkpointLocation: String, - operatorId: Long, - storeName: String, - storeVersion: Long, + stateInfo: StatefulOperatorStateInfo, keySchema: StructType, valueSchema: StructType, indexOrdinal: Option[Int], @@ -79,10 +72,10 @@ package object state { new StateStoreRDD( dataRDD, wrappedF, - checkpointLocation, - operatorId, - storeName, - storeVersion, + stateInfo.checkpointLocation, + stateInfo.queryRunId, + stateInfo.operatorId, + stateInfo.storeVersion, keySchema, valueSchema, indexOrdinal, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index 3e57f3fbada32..c5722466a33af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.streaming +import java.util.UUID import java.util.concurrent.TimeUnit._ import org.apache.spark.rdd.RDD @@ -36,20 +37,22 @@ import org.apache.spark.util.{CompletionIterator, NextIterator} /** Used to identify the state store for a given operator. */ -case class OperatorStateId( +case class StatefulOperatorStateInfo( checkpointLocation: String, + queryRunId: UUID, operatorId: Long, - batchId: Long) + storeVersion: Long) /** - * An operator that reads or writes state from the [[StateStore]]. The [[OperatorStateId]] should - * be filled in by `prepareForExecution` in [[IncrementalExecution]]. + * An operator that reads or writes state from the [[StateStore]]. + * The [[StatefulOperatorStateInfo]] should be filled in by `prepareForExecution` in + * [[IncrementalExecution]]. 
*/ trait StatefulOperator extends SparkPlan { - def stateId: Option[OperatorStateId] + def stateInfo: Option[StatefulOperatorStateInfo] - protected def getStateId: OperatorStateId = attachTree(this) { - stateId.getOrElse { + protected def getStateInfo: StatefulOperatorStateInfo = attachTree(this) { + stateInfo.getOrElse { throw new IllegalStateException("State location not present for execution") } } @@ -140,7 +143,7 @@ trait WatermarkSupport extends UnaryExecNode { */ case class StateStoreRestoreExec( keyExpressions: Seq[Attribute], - stateId: Option[OperatorStateId], + stateInfo: Option[StatefulOperatorStateInfo], child: SparkPlan) extends UnaryExecNode with StateStoreReader { @@ -148,10 +151,7 @@ case class StateStoreRestoreExec( val numOutputRows = longMetric("numOutputRows") child.execute().mapPartitionsWithStateStore( - getStateId.checkpointLocation, - operatorId = getStateId.operatorId, - storeName = "default", - storeVersion = getStateId.batchId, + getStateInfo, keyExpressions.toStructType, child.output.toStructType, indexOrdinal = None, @@ -177,7 +177,7 @@ case class StateStoreRestoreExec( */ case class StateStoreSaveExec( keyExpressions: Seq[Attribute], - stateId: Option[OperatorStateId] = None, + stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None, eventTimeWatermark: Option[Long] = None, child: SparkPlan) @@ -189,10 +189,7 @@ case class StateStoreSaveExec( "Incorrect planning in IncrementalExecution, outputMode has not been set") child.execute().mapPartitionsWithStateStore( - getStateId.checkpointLocation, - getStateId.operatorId, - storeName = "default", - getStateId.batchId, + getStateInfo, keyExpressions.toStructType, child.output.toStructType, indexOrdinal = None, @@ -319,7 +316,7 @@ case class StateStoreSaveExec( case class StreamingDeduplicateExec( keyExpressions: Seq[Attribute], child: SparkPlan, - stateId: Option[OperatorStateId] = None, + stateInfo: Option[StatefulOperatorStateInfo] = None, eventTimeWatermark: Option[Long] = None) extends UnaryExecNode with StateStoreWriter with WatermarkSupport { @@ -331,10 +328,7 @@ case class StreamingDeduplicateExec( metrics // force lazy init at driver child.execute().mapPartitionsWithStateStore( - getStateId.checkpointLocation, - getStateId.operatorId, - storeName = "default", - getStateId.batchId, + getStateInfo, keyExpressions.toStructType, child.output.toStructType, indexOrdinal = None, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 002c45413b4c2..48b0ea20e5da1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -332,5 +332,6 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo } awaitTerminationLock.notifyAll() } + stateStoreCoordinator.deactivateInstances(terminatedQuery.runId) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinatorSuite.scala index a7e32626264cc..9a7595eee7bd0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreCoordinatorSuite.scala @@ -17,11 +17,17 @@ package 
org.apache.spark.sql.execution.streaming.state +import java.util.UUID + import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ import org.apache.spark.{SharedSparkContext, SparkContext, SparkFunSuite} import org.apache.spark.scheduler.ExecutorCacheTaskLocation +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingQueryWrapper} +import org.apache.spark.sql.functions.count +import org.apache.spark.util.Utils class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { @@ -29,7 +35,7 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { test("report, verify, getLocation") { withCoordinatorRef(sc) { coordinatorRef => - val id = StateStoreId("x", 0, 0) + val id = StateStoreProviderId(StateStoreId("x", 0, 0), UUID.randomUUID) assert(coordinatorRef.verifyIfInstanceActive(id, "exec1") === false) assert(coordinatorRef.getLocation(id) === None) @@ -57,9 +63,11 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { test("make inactive") { withCoordinatorRef(sc) { coordinatorRef => - val id1 = StateStoreId("x", 0, 0) - val id2 = StateStoreId("y", 1, 0) - val id3 = StateStoreId("x", 0, 1) + val runId1 = UUID.randomUUID + val runId2 = UUID.randomUUID + val id1 = StateStoreProviderId(StateStoreId("x", 0, 0), runId1) + val id2 = StateStoreProviderId(StateStoreId("y", 1, 0), runId2) + val id3 = StateStoreProviderId(StateStoreId("x", 0, 1), runId1) val host = "hostX" val exec = "exec1" @@ -73,7 +81,7 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { assert(coordinatorRef.verifyIfInstanceActive(id3, exec) === true) } - coordinatorRef.deactivateInstances("x") + coordinatorRef.deactivateInstances(runId1) assert(coordinatorRef.verifyIfInstanceActive(id1, exec) === false) assert(coordinatorRef.verifyIfInstanceActive(id2, exec) === true) @@ -85,7 +93,7 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { Some(ExecutorCacheTaskLocation(host, exec).toString)) assert(coordinatorRef.getLocation(id3) === None) - coordinatorRef.deactivateInstances("y") + coordinatorRef.deactivateInstances(runId2) assert(coordinatorRef.verifyIfInstanceActive(id2, exec) === false) assert(coordinatorRef.getLocation(id2) === None) } @@ -95,7 +103,7 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { withCoordinatorRef(sc) { coordRef1 => val coordRef2 = StateStoreCoordinatorRef.forDriver(sc.env) - val id = StateStoreId("x", 0, 0) + val id = StateStoreProviderId(StateStoreId("x", 0, 0), UUID.randomUUID) coordRef1.reportActiveInstance(id, "hostX", "exec1") @@ -107,6 +115,45 @@ class StateStoreCoordinatorSuite extends SparkFunSuite with SharedSparkContext { } } } + + test("query stop deactivates related store providers") { + var coordRef: StateStoreCoordinatorRef = null + try { + val spark = SparkSession.builder().sparkContext(sc).getOrCreate() + SparkSession.setActiveSession(spark) + import spark.implicits._ + coordRef = spark.streams.stateStoreCoordinator + implicit val sqlContext = spark.sqlContext + spark.conf.set("spark.sql.shuffle.partitions", "1") + + // Start a query and run a batch to load state stores + val inputData = MemoryStream[Int] + val aggregated = inputData.toDF().groupBy("value").agg(count("*")) // stateful query + val checkpointLocation = Utils.createTempDir().getAbsoluteFile + val query = aggregated.writeStream + .format("memory") + .outputMode("update") + 
.queryName("query") + .option("checkpointLocation", checkpointLocation.toString) + .start() + inputData.addData(1, 2, 3) + query.processAllAvailable() + + // Verify state store has been loaded + val stateCheckpointDir = + query.asInstanceOf[StreamingQueryWrapper].streamingQuery.lastExecution.checkpointLocation + val providerId = StateStoreProviderId(StateStoreId(stateCheckpointDir, 0, 0), query.runId) + assert(coordRef.getLocation(providerId).nonEmpty) + + // Stop and verify whether the stores are deactivated in the coordinator + query.stop() + assert(coordRef.getLocation(providerId).isEmpty) + } finally { + SparkSession.getActiveSession.foreach(_.streams.active.foreach(_.stop())) + if (coordRef != null) coordRef.stop() + StateStore.stop() + } + } } object StateStoreCoordinatorSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala index 4a1a089af54c2..defb9ed63a881 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala @@ -19,20 +19,19 @@ package org.apache.spark.sql.execution.streaming.state import java.io.File import java.nio.file.Files +import java.util.UUID import scala.util.Random import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} -import org.scalatest.concurrent.Eventually._ -import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} -import org.apache.spark.sql.LocalSparkSession._ -import org.apache.spark.LocalSparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation +import org.apache.spark.sql.LocalSparkSession._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.execution.streaming.StatefulOperatorStateInfo import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.util.{CompletionIterator, Utils} @@ -57,16 +56,14 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn test("versioning and immutability") { withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark => val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString - val opId = 0 - val rdd1 = - makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( - spark.sqlContext, path, opId, "name", storeVersion = 0, keySchema, valueSchema, None)( + val rdd1 = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( + spark.sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)( increment) assert(rdd1.collect().toSet === Set("a" -> 2, "b" -> 1)) // Generate next version of stores val rdd2 = makeRDD(spark.sparkContext, Seq("a", "c")).mapPartitionsWithStateStore( - spark.sqlContext, path, opId, "name", storeVersion = 1, keySchema, valueSchema, None)( + spark.sqlContext, operatorStateInfo(path, version = 1), keySchema, valueSchema, None)( increment) assert(rdd2.collect().toSet === Set("a" -> 3, "b" -> 1, "c" -> 1)) @@ -76,7 +73,6 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn } test("recovering from files") { - val opId = 0 val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString def makeStoreRDD( @@ -85,7 +81,8 @@ class 
StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn storeVersion: Int): RDD[(String, Int)] = { implicit val sqlContext = spark.sqlContext makeRDD(spark.sparkContext, Seq("a")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion, keySchema, valueSchema, None)(increment) + sqlContext, operatorStateInfo(path, version = storeVersion), + keySchema, valueSchema, None)(increment) } // Generate RDDs and state store data @@ -132,17 +129,17 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn } val rddOfGets1 = makeRDD(spark.sparkContext, Seq("a", "b", "c")).mapPartitionsWithStateStore( - spark.sqlContext, path, opId, "name", storeVersion = 0, keySchema, valueSchema, None)( + spark.sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)( iteratorOfGets) assert(rddOfGets1.collect().toSet === Set("a" -> None, "b" -> None, "c" -> None)) val rddOfPuts = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion = 0, keySchema, valueSchema, None)( + sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)( iteratorOfPuts) assert(rddOfPuts.collect().toSet === Set("a" -> 1, "a" -> 2, "b" -> 1)) val rddOfGets2 = makeRDD(spark.sparkContext, Seq("a", "b", "c")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion = 1, keySchema, valueSchema, None)( + sqlContext, operatorStateInfo(path, version = 1), keySchema, valueSchema, None)( iteratorOfGets) assert(rddOfGets2.collect().toSet === Set("a" -> Some(2), "b" -> Some(1), "c" -> None)) } @@ -150,22 +147,25 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn test("preferred locations using StateStoreCoordinator") { quietly { + val queryRunId = UUID.randomUUID val opId = 0 val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString withSparkSession(SparkSession.builder.config(sparkConf).getOrCreate()) { spark => implicit val sqlContext = spark.sqlContext val coordinatorRef = sqlContext.streams.stateStoreCoordinator - coordinatorRef.reportActiveInstance(StateStoreId(path, opId, 0, "name"), "host1", "exec1") - coordinatorRef.reportActiveInstance(StateStoreId(path, opId, 1, "name"), "host2", "exec2") + val storeProviderId1 = StateStoreProviderId(StateStoreId(path, opId, 0), queryRunId) + val storeProviderId2 = StateStoreProviderId(StateStoreId(path, opId, 1), queryRunId) + coordinatorRef.reportActiveInstance(storeProviderId1, "host1", "exec1") + coordinatorRef.reportActiveInstance(storeProviderId2, "host2", "exec2") - assert( - coordinatorRef.getLocation(StateStoreId(path, opId, 0, "name")) === + require( + coordinatorRef.getLocation(storeProviderId1) === Some(ExecutorCacheTaskLocation("host1", "exec1").toString)) val rdd = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion = 0, keySchema, valueSchema, None)( - increment) + sqlContext, operatorStateInfo(path, queryRunId = queryRunId), + keySchema, valueSchema, None)(increment) require(rdd.partitions.length === 2) assert( @@ -192,12 +192,12 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn val path = Utils.createDirectory(tempDir, Random.nextString(10)).toString val opId = 0 val rdd1 = makeRDD(spark.sparkContext, Seq("a", "b", "a")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion = 0, keySchema, valueSchema, None)(increment) + 
sqlContext, operatorStateInfo(path, version = 0), keySchema, valueSchema, None)(increment) assert(rdd1.collect().toSet === Set("a" -> 2, "b" -> 1)) // Generate next version of stores val rdd2 = makeRDD(spark.sparkContext, Seq("a", "c")).mapPartitionsWithStateStore( - sqlContext, path, opId, "name", storeVersion = 1, keySchema, valueSchema, None)(increment) + sqlContext, operatorStateInfo(path, version = 1), keySchema, valueSchema, None)(increment) assert(rdd2.collect().toSet === Set("a" -> 3, "b" -> 1, "c" -> 1)) // Make sure the previous RDD still has the same data. @@ -210,6 +210,13 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn sc.makeRDD(seq, 2).groupBy(x => x).flatMap(_._2) } + private def operatorStateInfo( + path: String, + queryRunId: UUID = UUID.randomUUID, + version: Int = 0): StatefulOperatorStateInfo = { + StatefulOperatorStateInfo(path, queryRunId, operatorId = 0, version) + } + private val increment = (store: StateStore, iter: Iterator[String]) => { iter.foreach { s => val key = stringToRow(s) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index af2b9f1c11fb6..c2087ec219e57 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.streaming.state import java.io.{File, IOException} import java.net.URI +import java.util.UUID import scala.collection.JavaConverters._ import scala.collection.mutable @@ -33,8 +34,11 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkFunSuite} import org.apache.spark.LocalSparkContext._ +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingQueryWrapper} +import org.apache.spark.sql.functions.count import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -143,7 +147,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] provider.getStore(0).commit() // Verify we don't leak temp files - val tempFiles = FileUtils.listFiles(new File(provider.id.checkpointLocation), + val tempFiles = FileUtils.listFiles(new File(provider.stateStoreId.checkpointRootLocation), null, true).asScala.filter(_.getName.startsWith("temp-")) assert(tempFiles.isEmpty) } @@ -183,7 +187,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] test("StateStore.get") { quietly { val dir = newDir() - val storeId = StateStoreId(dir, 0, 0) + val storeId = StateStoreProviderId(StateStoreId(dir, 0, 0), UUID.randomUUID) val storeConf = StateStoreConf.empty val hadoopConf = new Configuration() @@ -243,18 +247,18 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] .set("spark.rpc.numRetries", "1") val opId = 0 val dir = newDir() - val storeId = StateStoreId(dir, opId, 0) + val storeProviderId = StateStoreProviderId(StateStoreId(dir, opId, 0), UUID.randomUUID) val sqlConf = new SQLConf() sqlConf.setConf(SQLConf.MIN_BATCHES_TO_RETAIN, 2) val storeConf = StateStoreConf(sqlConf) val 
hadoopConf = new Configuration() - val provider = newStoreProvider(storeId) + val provider = newStoreProvider(storeProviderId.storeId) var latestStoreVersion = 0 def generateStoreVersions() { for (i <- 1 to 20) { - val store = StateStore.get(storeId, keySchema, valueSchema, None, + val store = StateStore.get(storeProviderId, keySchema, valueSchema, None, latestStoreVersion, storeConf, hadoopConf) put(store, "a", i) store.commit() @@ -274,7 +278,8 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] eventually(timeout(timeoutDuration)) { // Store should have been reported to the coordinator - assert(coordinatorRef.getLocation(storeId).nonEmpty, "active instance was not reported") + assert(coordinatorRef.getLocation(storeProviderId).nonEmpty, + "active instance was not reported") // Background maintenance should clean up and generate snapshots assert(StateStore.isMaintenanceRunning, "Maintenance task is not running") @@ -295,35 +300,35 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] assert(!fileExists(provider, 1, isSnapshot = false), "earliest file not deleted") } - // If driver decides to deactivate all instances of the store, then this instance - // should be unloaded - coordinatorRef.deactivateInstances(dir) + // If driver decides to deactivate all stores related to a query run, + // then this instance should be unloaded + coordinatorRef.deactivateInstances(storeProviderId.queryRunId) eventually(timeout(timeoutDuration)) { - assert(!StateStore.isLoaded(storeId)) + assert(!StateStore.isLoaded(storeProviderId)) } // Reload the store and verify - StateStore.get(storeId, keySchema, valueSchema, indexOrdinal = None, + StateStore.get(storeProviderId, keySchema, valueSchema, indexOrdinal = None, latestStoreVersion, storeConf, hadoopConf) - assert(StateStore.isLoaded(storeId)) + assert(StateStore.isLoaded(storeProviderId)) // If some other executor loads the store, then this instance should be unloaded - coordinatorRef.reportActiveInstance(storeId, "other-host", "other-exec") + coordinatorRef.reportActiveInstance(storeProviderId, "other-host", "other-exec") eventually(timeout(timeoutDuration)) { - assert(!StateStore.isLoaded(storeId)) + assert(!StateStore.isLoaded(storeProviderId)) } // Reload the store and verify - StateStore.get(storeId, keySchema, valueSchema, indexOrdinal = None, + StateStore.get(storeProviderId, keySchema, valueSchema, indexOrdinal = None, latestStoreVersion, storeConf, hadoopConf) - assert(StateStore.isLoaded(storeId)) + assert(StateStore.isLoaded(storeProviderId)) } } // Verify if instance is unloaded if SparkContext is stopped eventually(timeout(timeoutDuration)) { require(SparkEnv.get === null) - assert(!StateStore.isLoaded(storeId)) + assert(!StateStore.isLoaded(storeProviderId)) assert(!StateStore.isMaintenanceRunning) } } @@ -344,7 +349,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] test("SPARK-18416: do not create temp delta file until the store is updated") { val dir = newDir() - val storeId = StateStoreId(dir, 0, 0) + val storeId = StateStoreProviderId(StateStoreId(dir, 0, 0), UUID.randomUUID) val storeConf = StateStoreConf.empty val hadoopConf = new Configuration() val deltaFileDir = new File(s"$dir/0/0/") @@ -408,12 +413,60 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] assert(numDeltaFiles === 3) } + test("SPARK-21145: Restarted queries create new provider instances") { + try { + val checkpointLocation = 
Utils.createTempDir().getAbsoluteFile + val spark = SparkSession.builder().master("local[2]").getOrCreate() + SparkSession.setActiveSession(spark) + implicit val sqlContext = spark.sqlContext + spark.conf.set("spark.sql.shuffle.partitions", "1") + import spark.implicits._ + val inputData = MemoryStream[Int] + + def runQueryAndGetLoadedProviders(): Seq[StateStoreProvider] = { + val aggregated = inputData.toDF().groupBy("value").agg(count("*")) + // stateful query + val query = aggregated.writeStream + .format("memory") + .outputMode("complete") + .queryName("query") + .option("checkpointLocation", checkpointLocation.toString) + .start() + inputData.addData(1, 2, 3) + query.processAllAvailable() + require(query.lastProgress != null) // at least one batch processed after start + val loadedProvidersMethod = + PrivateMethod[mutable.HashMap[StateStoreProviderId, StateStoreProvider]]('loadedProviders) + val loadedProvidersMap = StateStore invokePrivate loadedProvidersMethod() + val loadedProviders = loadedProvidersMap.synchronized { loadedProvidersMap.values.toSeq } + query.stop() + loadedProviders + } + + val loadedProvidersAfterRun1 = runQueryAndGetLoadedProviders() + require(loadedProvidersAfterRun1.length === 1) + + val loadedProvidersAfterRun2 = runQueryAndGetLoadedProviders() + assert(loadedProvidersAfterRun2.length === 2) // two providers loaded for 2 runs + + // Both providers should have the same StateStoreId, but the should be different objects + assert(loadedProvidersAfterRun2(0).stateStoreId === loadedProvidersAfterRun2(1).stateStoreId) + assert(loadedProvidersAfterRun2(0) ne loadedProvidersAfterRun2(1)) + + } finally { + SparkSession.getActiveSession.foreach { spark => + spark.streams.active.foreach(_.stop()) + spark.stop() + } + } + } + override def newStoreProvider(): HDFSBackedStateStoreProvider = { newStoreProvider(opId = Random.nextInt(), partition = 0) } override def newStoreProvider(storeId: StateStoreId): HDFSBackedStateStoreProvider = { - newStoreProvider(storeId.operatorId, storeId.partitionId, dir = storeId.checkpointLocation) + newStoreProvider(storeId.operatorId, storeId.partitionId, dir = storeId.checkpointRootLocation) } override def getLatestData(storeProvider: HDFSBackedStateStoreProvider): Set[(String, Int)] = { @@ -423,7 +476,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] override def getData( provider: HDFSBackedStateStoreProvider, version: Int = -1): Set[(String, Int)] = { - val reloadedProvider = newStoreProvider(provider.id) + val reloadedProvider = newStoreProvider(provider.stateStoreId) if (version < 0) { reloadedProvider.latestIterator().map(rowsToStringInt).toSet } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 4ede4fd9a035e..86c3a35a59c13 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -777,7 +777,7 @@ class TestStateStoreProvider extends StateStoreProvider { throw new Exception("Successfully instantiated") } - override def id: StateStoreId = null + override def stateStoreId: StateStoreId = null override def close(): Unit = { } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 2a4039cc5831a..b2c42eef88f6d 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -26,9 +26,8 @@ import scala.reflect.ClassTag import scala.util.Random import scala.util.control.NonFatal -import org.scalatest.Assertions +import org.scalatest.{Assertions, BeforeAndAfterAll} import org.scalatest.concurrent.{Eventually, Timeouts} -import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.Span @@ -39,9 +38,10 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, Ro import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils} +import org.apache.spark.util.{Clock, SystemClock, Utils} /** * A framework for implementing tests for streaming queries and sources. @@ -67,7 +67,12 @@ import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils} * avoid hanging forever in the case of failures. However, individual suites can change this * by overriding `streamingTimeout`. */ -trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { +trait StreamTest extends QueryTest with SharedSQLContext with Timeouts with BeforeAndAfterAll { + + override def afterAll(): Unit = { + super.afterAll() + StateStore.stop() // stop the state store maintenance thread and unload store providers + } /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds From 153dd49b74e1b6df2b8e35760806c9754ca7bfae Mon Sep 17 00:00:00 2001 From: jinxing Date: Fri, 23 Jun 2017 20:41:17 +0800 Subject: [PATCH 002/779] [SPARK-21047] Add test suites for complicated cases in ColumnarBatchSuite ## What changes were proposed in this pull request? Current ColumnarBatchSuite has very simple test cases for `Array` and `Struct`. This pr wants to add some test suites for complicated cases in ColumnVector. Author: jinxing Closes #18327 from jinxing64/SPARK-21047. 
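As a rough mental model for what the new nested-array tests below assert, the following plain-Scala sketch shows the offset/length encoding used for array values: each array is an (offset, length) slice over a flattened child buffer, matching the `putArray(ordinal, offset, length)` calls in the tests. This deliberately avoids the `ColumnVector` API, so the names here are illustrative only.

```scala
object NestedArrayLayoutDemo extends App {
  // Flattened child elements backing the arrays [0], [1, 2], [], [null, 4, 5]
  // (a null element is modeled as None in this sketch).
  val childData: Array[Option[Int]] =
    Array(Some(0), Some(1), Some(2), None, Some(4), Some(5))

  // One (offset, length) pair per array, mirroring the test's putArray calls:
  // (0, 1), (1, 2), (2, 0), (3, 3).
  val arrays = Array((0, 1), (1, 2), (2, 0), (3, 3))

  // Reading array i means slicing the child buffer, analogous to getArray(i).
  def readArray(i: Int): Seq[Option[Int]] = {
    val (offset, length) = arrays(i)
    childData.slice(offset, offset + length).toSeq
  }

  assert(readArray(0) == Seq(Some(0)))
  assert(readArray(1) == Seq(Some(1), Some(2)))
  assert(readArray(2) == Seq.empty)
  assert(readArray(3) == Seq(None, Some(4), Some(5)))
}
```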
--- .../execution/vectorized/ColumnarBatch.java | 35 ++++- .../vectorized/ColumnarBatchSuite.scala | 122 ++++++++++++++++++ 2 files changed, 156 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java index 8b7b0e655b31d..e23a64350cbc5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java @@ -241,7 +241,40 @@ public MapData getMap(int ordinal) { @Override public Object get(int ordinal, DataType dataType) { - throw new UnsupportedOperationException(); + if (dataType instanceof BooleanType) { + return getBoolean(ordinal); + } else if (dataType instanceof ByteType) { + return getByte(ordinal); + } else if (dataType instanceof ShortType) { + return getShort(ordinal); + } else if (dataType instanceof IntegerType) { + return getInt(ordinal); + } else if (dataType instanceof LongType) { + return getLong(ordinal); + } else if (dataType instanceof FloatType) { + return getFloat(ordinal); + } else if (dataType instanceof DoubleType) { + return getDouble(ordinal); + } else if (dataType instanceof StringType) { + return getUTF8String(ordinal); + } else if (dataType instanceof BinaryType) { + return getBinary(ordinal); + } else if (dataType instanceof DecimalType) { + DecimalType t = (DecimalType) dataType; + return getDecimal(ordinal, t.precision(), t.scale()); + } else if (dataType instanceof DateType) { + return getInt(ordinal); + } else if (dataType instanceof TimestampType) { + return getLong(ordinal); + } else if (dataType instanceof ArrayType) { + return getArray(ordinal); + } else if (dataType instanceof StructType) { + return getStruct(ordinal, ((StructType)dataType).fields().length); + } else if (dataType instanceof MapType) { + return getMap(ordinal); + } else { + throw new UnsupportedOperationException("Datatype not supported " + dataType); + } } @Override diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index e48e3f6402901..80d41577dcf2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -739,6 +739,128 @@ class ColumnarBatchSuite extends SparkFunSuite { }} } + test("Nest Array in Array.") { + (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => + val column = ColumnVector.allocate(10, new ArrayType(new ArrayType(IntegerType, true), true), + memMode) + val childColumn = column.arrayData() + val data = column.arrayData().arrayData() + (0 until 6).foreach { + case 3 => data.putNull(3) + case i => data.putInt(i, i) + } + // Arrays in child column: [0], [1, 2], [], [null, 4, 5] + childColumn.putArray(0, 0, 1) + childColumn.putArray(1, 1, 2) + childColumn.putArray(2, 2, 0) + childColumn.putArray(3, 3, 3) + // Arrays in column: [[0]], [[1, 2], []], [[], [null, 4, 5]], null + column.putArray(0, 0, 1) + column.putArray(1, 1, 2) + column.putArray(2, 2, 2) + column.putNull(3) + + assert(column.getArray(0).getArray(0).toIntArray() === Array(0)) + assert(column.getArray(1).getArray(0).toIntArray() === Array(1, 2)) + assert(column.getArray(1).getArray(1).toIntArray() === Array()) + 
assert(column.getArray(2).getArray(0).toIntArray() === Array()) + assert(column.getArray(2).getArray(1).isNullAt(0)) + assert(column.getArray(2).getArray(1).getInt(1) === 4) + assert(column.getArray(2).getArray(1).getInt(2) === 5) + assert(column.isNullAt(3)) + } + } + + test("Nest Struct in Array.") { + (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => + val schema = new StructType().add("int", IntegerType).add("long", LongType) + val column = ColumnVector.allocate(10, new ArrayType(schema, true), memMode) + val data = column.arrayData() + val c0 = data.getChildColumn(0) + val c1 = data.getChildColumn(1) + // Structs in child column: (0, 0), (1, 10), (2, 20), (3, 30), (4, 40), (5, 50) + (0 until 6).foreach { i => + c0.putInt(i, i) + c1.putLong(i, i * 10) + } + // Arrays in column: [(0, 0), (1, 10)], [(1, 10), (2, 20), (3, 30)], + // [(4, 40), (5, 50)] + column.putArray(0, 0, 2) + column.putArray(1, 1, 3) + column.putArray(2, 4, 2) + + assert(column.getArray(0).getStruct(0, 2).toSeq(schema) === Seq(0, 0)) + assert(column.getArray(0).getStruct(1, 2).toSeq(schema) === Seq(1, 10)) + assert(column.getArray(1).getStruct(0, 2).toSeq(schema) === Seq(1, 10)) + assert(column.getArray(1).getStruct(1, 2).toSeq(schema) === Seq(2, 20)) + assert(column.getArray(1).getStruct(2, 2).toSeq(schema) === Seq(3, 30)) + assert(column.getArray(2).getStruct(0, 2).toSeq(schema) === Seq(4, 40)) + assert(column.getArray(2).getStruct(1, 2).toSeq(schema) === Seq(5, 50)) + } + } + + test("Nest Array in Struct.") { + (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => + val schema = new StructType() + .add("int", IntegerType) + .add("array", new ArrayType(IntegerType, true)) + val column = ColumnVector.allocate(10, schema, memMode) + val c0 = column.getChildColumn(0) + val c1 = column.getChildColumn(1) + c0.putInt(0, 0) + c0.putInt(1, 1) + c0.putInt(2, 2) + val c1Child = c1.arrayData() + (0 until 6).foreach { i => + c1Child.putInt(i, i) + } + // Arrays in c1: [0, 1], [2], [3, 4, 5] + c1.putArray(0, 0, 2) + c1.putArray(1, 2, 1) + c1.putArray(2, 3, 3) + + assert(column.getStruct(0).getInt(0) === 0) + assert(column.getStruct(0).getArray(1).toIntArray() === Array(0, 1)) + assert(column.getStruct(1).getInt(0) === 1) + assert(column.getStruct(1).getArray(1).toIntArray() === Array(2)) + assert(column.getStruct(2).getInt(0) === 2) + assert(column.getStruct(2).getArray(1).toIntArray() === Array(3, 4, 5)) + } + } + + test("Nest Struct in Struct.") { + (MemoryMode.ON_HEAP :: Nil).foreach { memMode => + val subSchema = new StructType() + .add("int", IntegerType) + .add("int", IntegerType) + val schema = new StructType() + .add("int", IntegerType) + .add("struct", subSchema) + val column = ColumnVector.allocate(10, schema, memMode) + val c0 = column.getChildColumn(0) + val c1 = column.getChildColumn(1) + c0.putInt(0, 0) + c0.putInt(1, 1) + c0.putInt(2, 2) + val c1c0 = c1.getChildColumn(0) + val c1c1 = c1.getChildColumn(1) + // Structs in c1: (7, 70), (8, 80), (9, 90) + c1c0.putInt(0, 7) + c1c0.putInt(1, 8) + c1c0.putInt(2, 9) + c1c1.putInt(0, 70) + c1c1.putInt(1, 80) + c1c1.putInt(2, 90) + + assert(column.getStruct(0).getInt(0) === 0) + assert(column.getStruct(0).getStruct(1, 2).toSeq(subSchema) === Seq(7, 70)) + assert(column.getStruct(1).getInt(0) === 1) + assert(column.getStruct(1).getStruct(1, 2).toSeq(subSchema) === Seq(8, 80)) + assert(column.getStruct(2).getInt(0) === 2) + assert(column.getStruct(2).getStruct(1, 2).toSeq(subSchema) === Seq(9, 90)) + } + } + test("ColumnarBatch basic") 
{ (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { val schema = new StructType() From acd208ee50b29bde4e097bf88761867b1d57a665 Mon Sep 17 00:00:00 2001 From: 10129659 Date: Fri, 23 Jun 2017 20:53:26 +0800 Subject: [PATCH 003/779] [SPARK-21115][CORE] If the cores left are fewer than coresPerExecutor, they will not be allocated, so this should not be checked on every schedule ## What changes were proposed in this pull request? If we start an app with --total-executor-cores=4 and spark.executor.cores=3, one core is always left over, so org.apache.spark.deploy.master.startExecutorsOnWorkers tries to allocate executors on every schedule. A follow-up question is whether it would be better to allocate another executor with 1 core for the leftover cores. ## How was this patch tested? unit test Author: 10129659 Closes #18322 from eatoncys/leftcores. --- .../scala/org/apache/spark/SparkConf.scala | 11 +++++++ .../apache/spark/deploy/master/Master.scala | 29 ++++++++++--------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index ba7a65f79c414..de2f475c6895f 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -543,6 +543,17 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria } } + if (contains("spark.cores.max") && contains("spark.executor.cores")) { + val totalCores = getInt("spark.cores.max", 1) + val executorCores = getInt("spark.executor.cores", 1) + val leftCores = totalCores % executorCores + if (leftCores != 0) { + logWarning(s"Total executor cores: ${totalCores} is not " + + s"divisible by cores per executor: ${executorCores}, " + + s"the left cores: ${leftCores} will not be allocated") + } + } + val encryptionEnabled = get(NETWORK_ENCRYPTION_ENABLED) || get(SASL_ENCRYPTION_ENABLED) require(!encryptionEnabled || get(NETWORK_AUTH_ENABLED), s"${NETWORK_AUTH_ENABLED.key} must be enabled when enabling encryption.") diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index c192a0cc82ef6..0dee25fb2ebe2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -659,19 +659,22 @@ private[deploy] class Master( private def startExecutorsOnWorkers(): Unit = { // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app // in the queue, then the second app, etc.
- for (app <- waitingApps if app.coresLeft > 0) { - val coresPerExecutor: Option[Int] = app.desc.coresPerExecutor - // Filter out workers that don't have enough resources to launch an executor - val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) - .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB && - worker.coresFree >= coresPerExecutor.getOrElse(1)) - .sortBy(_.coresFree).reverse - val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps) - - // Now that we've decided how many cores to allocate on each worker, let's allocate them - for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) { - allocateWorkerResourceToExecutors( - app, assignedCores(pos), coresPerExecutor, usableWorkers(pos)) + for (app <- waitingApps) { + val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1) + // If the cores left is less than the coresPerExecutor, the cores left will not be allocated + if (app.coresLeft >= coresPerExecutor) { + // Filter out workers that don't have enough resources to launch an executor + val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) + .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB && + worker.coresFree >= coresPerExecutor) + .sortBy(_.coresFree).reverse + val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps) + + // Now that we've decided how many cores to allocate on each worker, let's allocate them + for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) { + allocateWorkerResourceToExecutors( + app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos)) + } } } } From 5dca10b8fdec81a3cc476301fa4f82ea917c34ec Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 23 Jun 2017 21:51:55 +0800 Subject: [PATCH 004/779] [SPARK-21193][PYTHON] Specify Pandas version in setup.py ## What changes were proposed in this pull request? It looks like we missed specifying the Pandas version. This PR proposes to fix it: based on my testing, the minimum version should be Pandas 0.13.0, so this PR pins it at 0.13.0. Running the code below: ```python from pyspark.sql.types import * schema = StructType().add("a", IntegerType()).add("b", StringType())\ .add("c", BooleanType()).add("d", FloatType()) data = [ (1, "foo", True, 3.0,), (2, "foo", True, 5.0), (3, "bar", False, -1.0), (4, "bar", False, 6.0), ] spark.createDataFrame(data, schema).toPandas().dtypes ``` prints ...
**With Pandas 0.13.0** - released, 2014-01 ``` a int32 b object c bool d float32 dtype: object ``` **With Pandas 0.12.0** - - released, 2013-06 ``` Traceback (most recent call last): File "", line 1, in File ".../spark/python/pyspark/sql/dataframe.py", line 1734, in toPandas pdf[f] = pdf[f].astype(t, copy=False) TypeError: astype() got an unexpected keyword argument 'copy' ``` without `copy` ``` a int32 b object c bool d float32 dtype: object ``` **With Pandas 0.11.0** - released, 2013-03 ``` Traceback (most recent call last): File "", line 1, in File ".../spark/python/pyspark/sql/dataframe.py", line 1734, in toPandas pdf[f] = pdf[f].astype(t, copy=False) TypeError: astype() got an unexpected keyword argument 'copy' ``` without `copy` ``` a int32 b object c bool d float32 dtype: object ``` **With Pandas 0.10.0** - released, 2012-12 ``` Traceback (most recent call last): File "", line 1, in File ".../spark/python/pyspark/sql/dataframe.py", line 1734, in toPandas pdf[f] = pdf[f].astype(t, copy=False) TypeError: astype() got an unexpected keyword argument 'copy' ``` without `copy` ``` a int64 # <- this should be 'int32' b object c bool d float64 # <- this should be 'float32' ``` ## How was this patch tested? Manually tested with Pandas from 0.10.0 to 0.13.0. Author: hyukjinkwon Closes #18403 from HyukjinKwon/SPARK-21193. --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index f50035435e26b..2644d3e79dea1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -199,7 +199,7 @@ def _supports_symlinks(): extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], - 'sql': ['pandas'] + 'sql': ['pandas>=0.13.0'] }, classifiers=[ 'Development Status :: 5 - Production/Stable', From f3dea60793d86212ba1068e88ad89cb3dcf07801 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 23 Jun 2017 09:28:02 -0700 Subject: [PATCH 005/779] [SPARK-21144][SQL] Print a warning if the data schema and partition schema have the duplicate columns ## What changes were proposed in this pull request? The current master outputs unexpected results when the data schema and partition schema have the duplicate columns: ``` withTempPath { dir => val basePath = dir.getCanonicalPath spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=1").toString) spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=a").toString) spark.read.parquet(basePath).show() } +---+ |foo| +---+ | 1| | 1| | a| | a| | 1| | a| +---+ ``` This patch added code to print a warning when the duplication found. ## How was this patch tested? Manually checked. Author: Takeshi Yamamuro Closes #18375 from maropu/SPARK-21144-3. --- .../apache/spark/sql/util/SchemaUtils.scala | 53 +++++++++++++++++++ .../execution/datasources/DataSource.scala | 6 +++ 2 files changed, 59 insertions(+) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala new file mode 100644 index 0000000000000..e881685ce6262 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.util + +import org.apache.spark.internal.Logging + + +/** + * Utils for handling schemas. + * + * TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]]. + */ +private[spark] object SchemaUtils extends Logging { + + /** + * Checks if input column names have duplicate identifiers. Prints a warning message if + * the duplication exists. + * + * @param columnNames column names to check + * @param colType column type name, used in a warning message + * @param caseSensitiveAnalysis whether duplication checks should be case sensitive or not + */ + def checkColumnNameDuplication( + columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { + val names = if (caseSensitiveAnalysis) { + columnNames + } else { + columnNames.map(_.toLowerCase) + } + if (names.distinct.length != names.length) { + val duplicateColumns = names.groupBy(identity).collect { + case (x, ys) if ys.length > 1 => s"`$x`" + } + logWarning(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}. " + + "You might need to assign different column names.") + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 08c78e6e326af..75e530607570f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} +import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.util.Utils /** @@ -182,6 +183,11 @@ case class DataSource( throw new AnalysisException( s"Unable to infer schema for $format. It must be specified manually.") } + + SchemaUtils.checkColumnNameDuplication( + (dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema", + sparkSession.sessionState.conf.caseSensitiveAnalysis) + (dataSchema, partitionSchema) } From 07479b3cfb7a617a18feca14e9e31c208c80630e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 23 Jun 2017 09:59:24 -0700 Subject: [PATCH 006/779] [SPARK-21149][R] Add job description API for R ## What changes were proposed in this pull request? Extend `setJobDescription` to SparkR API. ## How was this patch tested? It looks difficult to add a test. Manually tested as below: ```r df <- createDataFrame(iris) count(df) setJobDescription("This is an example job.") count(df) ``` prints ... ![2017-06-22 12 05 49](https://user-images.githubusercontent.com/6477701/27415670-2a649936-5743-11e7-8e95-312f1cd103af.png) Author: hyukjinkwon Closes #18382 from HyukjinKwon/SPARK-21149. 
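The R wrapper added below is a thin bridge: `callJMethod(sc, "setJobDescription", value)` simply invokes `SparkContext.setJobDescription` on the JVM side. For reference, a minimal Scala sketch of the call the wrapper delegates to (the session settings and job here are illustrative, not from the patch):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("desc-example").getOrCreate()
// Jobs submitted after this call show the description in the web UI's job table.
spark.sparkContext.setJobDescription("This is an example job.")
spark.range(0, 100000).count()
spark.stop()
```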
--- R/pkg/NAMESPACE | 3 ++- R/pkg/R/sparkR.R | 17 +++++++++++++++++ R/pkg/tests/fulltests/test_context.R | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 229de4a997eef..b7fdae58de459 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -75,7 +75,8 @@ exportMethods("glm", # Job group lifecycle management methods export("setJobGroup", "clearJobGroup", - "cancelJobGroup") + "cancelJobGroup", + "setJobDescription") # Export Utility methods export("setLogLevel") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d0a12b7ecec65..f2d2620e5447a 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -535,6 +535,23 @@ cancelJobGroup <- function(sc, groupId) { } } +#' Set a human readable description of the current job. +#' +#' Set a description that is shown as a job description in UI. +#' +#' @param value The job description of the current job. +#' @rdname setJobDescription +#' @name setJobDescription +#' @examples +#'\dontrun{ +#' setJobDescription("This is an example job.") +#'} +#' @note setJobDescription since 2.3.0 +setJobDescription <- function(value) { + sc <- getSparkContext() + invisible(callJMethod(sc, "setJobDescription", value)) +} + sparkConfToSubmitOps <- new.env() sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index 710485d56685a..77635c5a256b9 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -100,6 +100,7 @@ test_that("job group functions can be called", { setJobGroup("groupId", "job description", TRUE) cancelJobGroup("groupId") clearJobGroup() + setJobDescription("job description") suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE)) suppressWarnings(cancelJobGroup(sc, "groupId")) From b803b66a8133f705463039325ee71ee6827ce1a7 Mon Sep 17 00:00:00 2001 From: wangzhenhua Date: Fri, 23 Jun 2017 10:33:53 -0700 Subject: [PATCH 007/779] [SPARK-21180][SQL] Remove conf from stats functions since now we have conf in LogicalPlan ## What changes were proposed in this pull request? After wiring `SQLConf` in logical plan ([PR 18299](https://github.com/apache/spark/pull/18299)), we can remove the need of passing `conf` into `def stats` and `def computeStats`. ## How was this patch tested? Covered by existing tests, plus some modified existing tests. Author: wangzhenhua Author: Zhenhua Wang Closes #18391 from wzhfy/removeConf. 
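The mechanical change running through the diff below is that statistics no longer take a `SQLConf` argument: `plan.stats(conf)` becomes `plan.stats`, and `computeStats(conf: SQLConf)` becomes `computeStats`, reading the conf already available on `LogicalPlan`. A schematic sketch of the new shape, using a hypothetical leaf node (the class and its fields are illustrative, not part of the patch):

```scala
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}

// Hypothetical relation showing the post-SPARK-21180 signature: no SQLConf parameter;
// `conf` comes from LogicalPlan itself, and callers simply use `myRelation.stats`.
case class MyRelation(output: Seq[Attribute], sizeHint: BigInt) extends LeafNode {
  override def computeStats: Statistics = {
    if (conf.cboEnabled) {
      Statistics(sizeInBytes = sizeHint, rowCount = Some(sizeHint / 8))
    } else {
      Statistics(sizeInBytes = sizeHint)
    }
  }
}
```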
--- .../sql/catalyst/catalog/interface.scala | 3 +- .../optimizer/CostBasedJoinReorder.scala | 4 +- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../optimizer/StarSchemaDetection.scala | 14 ++-- .../plans/logical/LocalRelation.scala | 3 +- .../catalyst/plans/logical/LogicalPlan.scala | 15 ++--- .../plans/logical/basicLogicalOperators.scala | 65 +++++++++---------- .../sql/catalyst/plans/logical/hints.scala | 5 +- .../statsEstimation/AggregateEstimation.scala | 7 +- .../statsEstimation/EstimationUtils.scala | 5 +- .../statsEstimation/FilterEstimation.scala | 5 +- .../statsEstimation/JoinEstimation.scala | 21 +++--- .../statsEstimation/ProjectEstimation.scala | 7 +- .../optimizer/JoinOptimizationSuite.scala | 2 +- .../optimizer/LimitPushdownSuite.scala | 6 +- .../AggregateEstimationSuite.scala | 30 +++++---- .../BasicStatsEstimationSuite.scala | 27 +++++--- .../FilterEstimationSuite.scala | 2 +- .../statsEstimation/JoinEstimationSuite.scala | 26 ++++---- .../ProjectEstimationSuite.scala | 4 +- .../StatsEstimationTestBase.scala | 18 +++-- .../spark/sql/execution/ExistingRDD.scala | 5 +- .../spark/sql/execution/QueryExecution.scala | 2 +- .../spark/sql/execution/SparkStrategies.scala | 13 ++-- .../execution/columnar/InMemoryRelation.scala | 3 +- .../datasources/LogicalRelation.scala | 3 +- .../sql/execution/streaming/memory.scala | 3 +- .../apache/spark/sql/CachedTableSuite.scala | 2 +- .../org/apache/spark/sql/DatasetSuite.scala | 2 +- .../org/apache/spark/sql/JoinSuite.scala | 2 +- .../spark/sql/StatisticsCollectionSuite.scala | 18 ++--- .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../datasources/HadoopFsRelationSuite.scala | 2 +- .../execution/streaming/MemorySinkSuite.scala | 6 +- .../apache/spark/sql/test/SQLTestData.scala | 3 - .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../spark/sql/hive/StatisticsSuite.scala | 10 +-- .../PruneFileSourcePartitionsSuite.scala | 2 +- 38 files changed, 178 insertions(+), 173 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index c043ed9c431b7..b63bef9193332 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -31,7 +31,6 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Attri import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.catalyst.util.quoteIdentifier -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType @@ -436,7 +435,7 @@ case class CatalogRelation( createTime = -1 )) - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { // For data source tables, we will create a `LogicalRelation` and won't call this method, for // hive serde tables, we will always generate a statistics. // TODO: unify the table stats generation. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala index 51eca6ca33760..3a7543e2141e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala @@ -58,7 +58,7 @@ case class CostBasedJoinReorder(conf: SQLConf) extends Rule[LogicalPlan] with Pr // Do reordering if the number of items is appropriate and join conditions exist. // We also need to check if costs of all items can be evaluated. if (items.size > 2 && items.size <= conf.joinReorderDPThreshold && conditions.nonEmpty && - items.forall(_.stats(conf).rowCount.isDefined)) { + items.forall(_.stats.rowCount.isDefined)) { JoinReorderDP.search(conf, items, conditions, output) } else { plan @@ -322,7 +322,7 @@ object JoinReorderDP extends PredicateHelper with Logging { /** Get the cost of the root node of this plan tree. */ def rootCost(conf: SQLConf): Cost = { if (itemIds.size > 1) { - val rootStats = plan.stats(conf) + val rootStats = plan.stats Cost(rootStats.rowCount.get, rootStats.sizeInBytes) } else { // If the plan is a leaf item, it has zero cost. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 3ab70fb90470c..b410312030c5d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -317,7 +317,7 @@ case class LimitPushDown(conf: SQLConf) extends Rule[LogicalPlan] { case FullOuter => (left.maxRows, right.maxRows) match { case (None, None) => - if (left.stats(conf).sizeInBytes >= right.stats(conf).sizeInBytes) { + if (left.stats.sizeInBytes >= right.stats.sizeInBytes) { join.copy(left = maybePushLimit(exp, left)) } else { join.copy(right = maybePushLimit(exp, right)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala index 97ee9988386dd..ca729127e7d1d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala @@ -82,7 +82,7 @@ case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper { // Find if the input plans are eligible for star join detection. // An eligible plan is a base table access with valid statistics. 
val foundEligibleJoin = input.forall { - case PhysicalOperation(_, _, t: LeafNode) if t.stats(conf).rowCount.isDefined => true + case PhysicalOperation(_, _, t: LeafNode) if t.stats.rowCount.isDefined => true case _ => false } @@ -181,7 +181,7 @@ case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper { val leafCol = findLeafNodeCol(column, plan) leafCol match { case Some(col) if t.outputSet.contains(col) => - val stats = t.stats(conf) + val stats = t.stats stats.rowCount match { case Some(rowCount) if rowCount >= 0 => if (stats.attributeStats.nonEmpty && stats.attributeStats.contains(col)) { @@ -237,7 +237,7 @@ case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper { val leafCol = findLeafNodeCol(column, plan) leafCol match { case Some(col) if t.outputSet.contains(col) => - val stats = t.stats(conf) + val stats = t.stats stats.attributeStats.nonEmpty && stats.attributeStats.contains(col) case None => false } @@ -296,11 +296,11 @@ case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper { */ private def getTableAccessCardinality( input: LogicalPlan): Option[BigInt] = input match { - case PhysicalOperation(_, cond, t: LeafNode) if t.stats(conf).rowCount.isDefined => - if (conf.cboEnabled && input.stats(conf).rowCount.isDefined) { - Option(input.stats(conf).rowCount.get) + case PhysicalOperation(_, cond, t: LeafNode) if t.stats.rowCount.isDefined => + if (conf.cboEnabled && input.stats.rowCount.isDefined) { + Option(input.stats.rowCount.get) } else { - Option(t.stats(conf).rowCount.get) + Option(t.stats.rowCount.get) } case _ => None } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala index 9cd5dfd21b160..dc2add64b68b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala @@ -21,7 +21,6 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { @@ -67,7 +66,7 @@ case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) } } - override def computeStats(conf: SQLConf): Statistics = + override def computeStats: Statistics = Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 95b4165f6b10d..0c098ac0209e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -23,7 +23,6 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.trees.CurrentOrigin -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType @@ -90,8 +89,8 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstrai * first 
time. If the configuration changes, the cache can be invalidated by calling * [[invalidateStatsCache()]]. */ - final def stats(conf: SQLConf): Statistics = statsCache.getOrElse { - statsCache = Some(computeStats(conf)) + final def stats: Statistics = statsCache.getOrElse { + statsCache = Some(computeStats) statsCache.get } @@ -108,11 +107,11 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstrai * * [[LeafNode]]s must override this. */ - protected def computeStats(conf: SQLConf): Statistics = { + protected def computeStats: Statistics = { if (children.isEmpty) { throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.") } - Statistics(sizeInBytes = children.map(_.stats(conf).sizeInBytes).product) + Statistics(sizeInBytes = children.map(_.stats.sizeInBytes).product) } override def verboseStringWithSuffix: String = { @@ -333,13 +332,13 @@ abstract class UnaryNode extends LogicalPlan { override protected def validConstraints: Set[Expression] = child.constraints - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { // There should be some overhead in Row object, the size should not be zero when there is // no columns, this help to prevent divide-by-zero error. val childRowSize = child.output.map(_.dataType.defaultSize).sum + 8 val outputRowSize = output.map(_.dataType.defaultSize).sum + 8 // Assume there will be the same number of rows as child has. - var sizeInBytes = (child.stats(conf).sizeInBytes * outputRowSize) / childRowSize + var sizeInBytes = (child.stats.sizeInBytes * outputRowSize) / childRowSize if (sizeInBytes == 0) { // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero // (product of children). @@ -347,7 +346,7 @@ abstract class UnaryNode extends LogicalPlan { } // Don't propagate rowCount and attributeStats, since they are not estimated here. 
- Statistics(sizeInBytes = sizeInBytes, hints = child.stats(conf).hints) + Statistics(sizeInBytes = sizeInBytes, hints = child.stats.hints) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 6e88b7a57dc33..d8f89b108e63f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -23,7 +23,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler @@ -65,11 +64,11 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extend override def validConstraints: Set[Expression] = child.constraints.union(getAliasedConstraints(projectList)) - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { if (conf.cboEnabled) { - ProjectEstimation.estimate(conf, this).getOrElse(super.computeStats(conf)) + ProjectEstimation.estimate(this).getOrElse(super.computeStats) } else { - super.computeStats(conf) + super.computeStats } } } @@ -139,11 +138,11 @@ case class Filter(condition: Expression, child: LogicalPlan) child.constraints.union(predicates.toSet) } - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { if (conf.cboEnabled) { - FilterEstimation(this, conf).estimate.getOrElse(super.computeStats(conf)) + FilterEstimation(this).estimate.getOrElse(super.computeStats) } else { - super.computeStats(conf) + super.computeStats } } } @@ -192,13 +191,13 @@ case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation } } - override def computeStats(conf: SQLConf): Statistics = { - val leftSize = left.stats(conf).sizeInBytes - val rightSize = right.stats(conf).sizeInBytes + override def computeStats: Statistics = { + val leftSize = left.stats.sizeInBytes + val rightSize = right.stats.sizeInBytes val sizeInBytes = if (leftSize < rightSize) leftSize else rightSize Statistics( sizeInBytes = sizeInBytes, - hints = left.stats(conf).hints.resetForJoin()) + hints = left.stats.hints.resetForJoin()) } } @@ -209,8 +208,8 @@ case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(le override protected def validConstraints: Set[Expression] = leftConstraints - override def computeStats(conf: SQLConf): Statistics = { - left.stats(conf).copy() + override def computeStats: Statistics = { + left.stats.copy() } } @@ -248,8 +247,8 @@ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { children.length > 1 && childrenResolved && allChildrenCompatible } - override def computeStats(conf: SQLConf): Statistics = { - val sizeInBytes = children.map(_.stats(conf).sizeInBytes).sum + override def computeStats: Statistics = { + val sizeInBytes = children.map(_.stats.sizeInBytes).sum Statistics(sizeInBytes = sizeInBytes) } @@ -357,20 +356,20 @@ case class Join( case _ => resolvedExceptNatural } - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { 
def simpleEstimation: Statistics = joinType match { case LeftAnti | LeftSemi => // LeftSemi and LeftAnti won't ever be bigger than left - left.stats(conf) + left.stats case _ => // Make sure we don't propagate isBroadcastable in other joins, because // they could explode the size. - val stats = super.computeStats(conf) + val stats = super.computeStats stats.copy(hints = stats.hints.resetForJoin()) } if (conf.cboEnabled) { - JoinEstimation.estimate(conf, this).getOrElse(simpleEstimation) + JoinEstimation.estimate(this).getOrElse(simpleEstimation) } else { simpleEstimation } @@ -523,7 +522,7 @@ case class Range( override def newInstance(): Range = copy(output = output.map(_.newInstance())) - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { val sizeInBytes = LongType.defaultSize * numElements Statistics( sizeInBytes = sizeInBytes ) } @@ -556,20 +555,20 @@ case class Aggregate( child.constraints.union(getAliasedConstraints(nonAgg)) } - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { def simpleEstimation: Statistics = { if (groupingExpressions.isEmpty) { Statistics( sizeInBytes = EstimationUtils.getOutputSize(output, outputRowCount = 1), rowCount = Some(1), - hints = child.stats(conf).hints) + hints = child.stats.hints) } else { - super.computeStats(conf) + super.computeStats } } if (conf.cboEnabled) { - AggregateEstimation.estimate(conf, this).getOrElse(simpleEstimation) + AggregateEstimation.estimate(this).getOrElse(simpleEstimation) } else { simpleEstimation } @@ -672,8 +671,8 @@ case class Expand( override def references: AttributeSet = AttributeSet(projections.flatten.flatMap(_.references)) - override def computeStats(conf: SQLConf): Statistics = { - val sizeInBytes = super.computeStats(conf).sizeInBytes * projections.length + override def computeStats: Statistics = { + val sizeInBytes = super.computeStats.sizeInBytes * projections.length Statistics(sizeInBytes = sizeInBytes) } @@ -743,9 +742,9 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryN case _ => None } } - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { val limit = limitExpr.eval().asInstanceOf[Int] - val childStats = child.stats(conf) + val childStats = child.stats val rowCount: BigInt = childStats.rowCount.map(_.min(limit)).getOrElse(limit) // Don't propagate column stats, because we don't know the distribution after a limit operation Statistics( @@ -763,9 +762,9 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNo case _ => None } } - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { val limit = limitExpr.eval().asInstanceOf[Int] - val childStats = child.stats(conf) + val childStats = child.stats if (limit == 0) { // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero // (product of children). 
@@ -832,9 +831,9 @@ case class Sample( override def output: Seq[Attribute] = child.output - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { val ratio = upperBound - lowerBound - val childStats = child.stats(conf) + val childStats = child.stats var sizeInBytes = EstimationUtils.ceil(BigDecimal(childStats.sizeInBytes) * ratio) if (sizeInBytes == 0) { sizeInBytes = 1 @@ -898,7 +897,7 @@ case class RepartitionByExpression( case object OneRowRelation extends LeafNode { override def maxRows: Option[Long] = Some(1) override def output: Seq[Attribute] = Nil - override def computeStats(conf: SQLConf): Statistics = Statistics(sizeInBytes = 1) + override def computeStats: Statistics = Statistics(sizeInBytes = 1) } /** A logical plan for `dropDuplicates`. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index e49970df80457..8479c702d7561 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.internal.SQLConf /** * A general hint for the child that is not yet resolved. This node is generated by the parser and @@ -44,8 +43,8 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo()) override lazy val canonicalized: LogicalPlan = child.canonicalized - override def computeStats(conf: SQLConf): Statistics = { - val stats = child.stats(conf) + override def computeStats: Statistics = { + val stats = child.stats stats.copy(hints = hints) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index a0c23198451a8..c41fac4015ec0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Statistics} -import org.apache.spark.sql.internal.SQLConf object AggregateEstimation { @@ -29,13 +28,13 @@ object AggregateEstimation { * Estimate the number of output rows based on column stats of group-by columns, and propagate * column stats for aggregate expressions. */ - def estimate(conf: SQLConf, agg: Aggregate): Option[Statistics] = { - val childStats = agg.child.stats(conf) + def estimate(agg: Aggregate): Option[Statistics] = { + val childStats = agg.child.stats // Check if we have column stats for all group-by columns. val colStatsExist = agg.groupingExpressions.forall { e => e.isInstanceOf[Attribute] && childStats.attributeStats.contains(e.asInstanceOf[Attribute]) } - if (rowCountsExist(conf, agg.child) && colStatsExist) { + if (rowCountsExist(agg.child) && colStatsExist) { // Multiply distinct counts of group-by columns. This is an upper bound, which assumes // the data contains all combinations of distinct values of group-by columns. 
var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index e5fcdf9039be9..9c34a9b7aa756 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -21,15 +21,14 @@ import scala.math.BigDecimal.RoundingMode import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LogicalPlan, Statistics} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DecimalType, _} object EstimationUtils { /** Check if each plan has rowCount in its statistics. */ - def rowCountsExist(conf: SQLConf, plans: LogicalPlan*): Boolean = - plans.forall(_.stats(conf).rowCount.isDefined) + def rowCountsExist(plans: LogicalPlan*): Boolean = + plans.forall(_.stats.rowCount.isDefined) /** Check if each attribute has column stat in the corresponding statistics. */ def columnStatsExist(statsAndAttr: (Statistics, Attribute)*): Boolean = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index df190867189ec..5a3bee7b9e449 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -25,12 +25,11 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -case class FilterEstimation(plan: Filter, catalystConf: SQLConf) extends Logging { +case class FilterEstimation(plan: Filter) extends Logging { - private val childStats = plan.child.stats(catalystConf) + private val childStats = plan.child.stats private val colStatsMap = new ColumnStatsMap(childStats.attributeStats) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala index 8ef905c45d50d..f48196997a24d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Statistics} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ -import org.apache.spark.sql.internal.SQLConf object JoinEstimation extends Logging { @@ -34,12 +33,12 @@ object JoinEstimation extends Logging { * 
Estimate statistics after join. Return `None` if the join type is not supported, or we don't * have enough statistics for estimation. */ - def estimate(conf: SQLConf, join: Join): Option[Statistics] = { + def estimate(join: Join): Option[Statistics] = { join.joinType match { case Inner | Cross | LeftOuter | RightOuter | FullOuter => - InnerOuterEstimation(conf, join).doEstimate() + InnerOuterEstimation(join).doEstimate() case LeftSemi | LeftAnti => - LeftSemiAntiEstimation(conf, join).doEstimate() + LeftSemiAntiEstimation(join).doEstimate() case _ => logDebug(s"[CBO] Unsupported join type: ${join.joinType}") None @@ -47,16 +46,16 @@ object JoinEstimation extends Logging { } } -case class InnerOuterEstimation(conf: SQLConf, join: Join) extends Logging { +case class InnerOuterEstimation(join: Join) extends Logging { - private val leftStats = join.left.stats(conf) - private val rightStats = join.right.stats(conf) + private val leftStats = join.left.stats + private val rightStats = join.right.stats /** * Estimate output size and number of rows after a join operator, and update output column stats. */ def doEstimate(): Option[Statistics] = join match { - case _ if !rowCountsExist(conf, join.left, join.right) => + case _ if !rowCountsExist(join.left, join.right) => None case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, _, _) => @@ -273,13 +272,13 @@ case class InnerOuterEstimation(conf: SQLConf, join: Join) extends Logging { } } -case class LeftSemiAntiEstimation(conf: SQLConf, join: Join) { +case class LeftSemiAntiEstimation(join: Join) { def doEstimate(): Option[Statistics] = { // TODO: It's error-prone to estimate cardinalities for LeftSemi and LeftAnti based on basic // column stats. Now we just propagate the statistics from left side. We should do more // accurate estimation when advanced stats (e.g. histograms) are available. 
- if (rowCountsExist(conf, join.left)) { - val leftStats = join.left.stats(conf) + if (rowCountsExist(join.left)) { + val leftStats = join.left.stats // Propagate the original column stats for cartesian product val outputRows = leftStats.rowCount.get Some(Statistics( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala index d700cd3b20f7d..489eb904ffd05 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala @@ -19,14 +19,13 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap} import org.apache.spark.sql.catalyst.plans.logical.{Project, Statistics} -import org.apache.spark.sql.internal.SQLConf object ProjectEstimation { import EstimationUtils._ - def estimate(conf: SQLConf, project: Project): Option[Statistics] = { - if (rowCountsExist(conf, project.child)) { - val childStats = project.child.stats(conf) + def estimate(project: Project): Option[Statistics] = { + if (rowCountsExist(project.child)) { + val childStats = project.child.stats val inputAttrStats = childStats.attributeStats // Match alias with its child's column stat val aliasStats = project.expressions.collect { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala index 105407d43bf39..a6584aa5fbba7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala @@ -142,7 +142,7 @@ class JoinOptimizationSuite extends PlanTest { comparePlans(optimized, expected) val broadcastChildren = optimized.collect { - case Join(_, r, _, _) if r.stats(conf).sizeInBytes == 1 => r + case Join(_, r, _, _) if r.stats.sizeInBytes == 1 => r } assert(broadcastChildren.size == 1) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala index fb34c82de468b..d8302dfc9462d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala @@ -112,7 +112,7 @@ class LimitPushdownSuite extends PlanTest { } test("full outer join where neither side is limited and both sides have same statistics") { - assert(x.stats(conf).sizeInBytes === y.stats(conf).sizeInBytes) + assert(x.stats.sizeInBytes === y.stats.sizeInBytes) val originalQuery = x.join(y, FullOuter).limit(1) val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = Limit(1, LocalLimit(1, x).join(y, FullOuter)).analyze @@ -121,7 +121,7 @@ class LimitPushdownSuite extends PlanTest { test("full outer join where neither side is limited and left side has larger statistics") { val xBig = testRelation.copy(data = Seq.fill(2)(null)).subquery('x) - assert(xBig.stats(conf).sizeInBytes > y.stats(conf).sizeInBytes) + assert(xBig.stats.sizeInBytes > 
y.stats.sizeInBytes) val originalQuery = xBig.join(y, FullOuter).limit(1) val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = Limit(1, LocalLimit(1, xBig).join(y, FullOuter)).analyze @@ -130,7 +130,7 @@ class LimitPushdownSuite extends PlanTest { test("full outer join where neither side is limited and right side has larger statistics") { val yBig = testRelation.copy(data = Seq.fill(2)(null)).subquery('y) - assert(x.stats(conf).sizeInBytes < yBig.stats(conf).sizeInBytes) + assert(x.stats.sizeInBytes < yBig.stats.sizeInBytes) val originalQuery = x.join(yBig, FullOuter).limit(1) val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = Limit(1, x.join(LocalLimit(1, yBig), FullOuter)).analyze diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala index 38483a298cef0..30ddf03bd3c4f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala @@ -100,17 +100,23 @@ class AggregateEstimationSuite extends StatsEstimationTestBase { size = Some(4 * (8 + 4)), attributeStats = AttributeMap(Seq("key12").map(nameToColInfo))) - val noGroupAgg = Aggregate(groupingExpressions = Nil, - aggregateExpressions = Seq(Alias(Count(Literal(1)), "cnt")()), child) - assert(noGroupAgg.stats(conf.copy(SQLConf.CBO_ENABLED -> false)) == - // overhead + count result size - Statistics(sizeInBytes = 8 + 8, rowCount = Some(1))) - - val hasGroupAgg = Aggregate(groupingExpressions = attributes, - aggregateExpressions = attributes :+ Alias(Count(Literal(1)), "cnt")(), child) - assert(hasGroupAgg.stats(conf.copy(SQLConf.CBO_ENABLED -> false)) == - // From UnaryNode.computeStats, childSize * outputRowSize / childRowSize - Statistics(sizeInBytes = 48 * (8 + 4 + 8) / (8 + 4))) + val originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) + try { + SQLConf.get.setConf(SQLConf.CBO_ENABLED, false) + val noGroupAgg = Aggregate(groupingExpressions = Nil, + aggregateExpressions = Seq(Alias(Count(Literal(1)), "cnt")()), child) + assert(noGroupAgg.stats == + // overhead + count result size + Statistics(sizeInBytes = 8 + 8, rowCount = Some(1))) + + val hasGroupAgg = Aggregate(groupingExpressions = attributes, + aggregateExpressions = attributes :+ Alias(Count(Literal(1)), "cnt")(), child) + assert(hasGroupAgg.stats == + // From UnaryNode.computeStats, childSize * outputRowSize / childRowSize + Statistics(sizeInBytes = 48 * (8 + 4 + 8) / (8 + 4))) + } finally { + SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) + } } private def checkAggStats( @@ -134,6 +140,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase { rowCount = Some(expectedOutputRowCount), attributeStats = expectedAttrStats) - assert(testAgg.stats(conf) == expectedStats) + assert(testAgg.stats == expectedStats) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index 833f5a71994f7..e9ed36feec48c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -57,16 +57,16 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { val localLimit = LocalLimit(Literal(2), plan) val globalLimit = GlobalLimit(Literal(2), plan) // LocalLimit's stats is just its child's stats except column stats - checkStats(localLimit, plan.stats(conf).copy(attributeStats = AttributeMap(Nil))) + checkStats(localLimit, plan.stats.copy(attributeStats = AttributeMap(Nil))) checkStats(globalLimit, Statistics(sizeInBytes = 24, rowCount = Some(2))) } test("limit estimation: limit > child's rowCount") { val localLimit = LocalLimit(Literal(20), plan) val globalLimit = GlobalLimit(Literal(20), plan) - checkStats(localLimit, plan.stats(conf).copy(attributeStats = AttributeMap(Nil))) + checkStats(localLimit, plan.stats.copy(attributeStats = AttributeMap(Nil))) // Limit is larger than child's rowCount, so GlobalLimit's stats is equal to its child's stats. - checkStats(globalLimit, plan.stats(conf).copy(attributeStats = AttributeMap(Nil))) + checkStats(globalLimit, plan.stats.copy(attributeStats = AttributeMap(Nil))) } test("limit estimation: limit = 0") { @@ -113,12 +113,19 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { plan: LogicalPlan, expectedStatsCboOn: Statistics, expectedStatsCboOff: Statistics): Unit = { - // Invalidate statistics - plan.invalidateStatsCache() - assert(plan.stats(conf.copy(SQLConf.CBO_ENABLED -> true)) == expectedStatsCboOn) - - plan.invalidateStatsCache() - assert(plan.stats(conf.copy(SQLConf.CBO_ENABLED -> false)) == expectedStatsCboOff) + val originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) + try { + // Invalidate statistics + plan.invalidateStatsCache() + SQLConf.get.setConf(SQLConf.CBO_ENABLED, true) + assert(plan.stats == expectedStatsCboOn) + + plan.invalidateStatsCache() + SQLConf.get.setConf(SQLConf.CBO_ENABLED, false) + assert(plan.stats == expectedStatsCboOff) + } finally { + SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) + } } /** Check estimated stats when it's the same whether cbo is turned on or off. 
*/ @@ -135,6 +142,6 @@ private case class DummyLogicalPlan( cboStats: Statistics) extends LogicalPlan { override def output: Seq[Attribute] = Nil override def children: Seq[LogicalPlan] = Nil - override def computeStats(conf: SQLConf): Statistics = + override def computeStats: Statistics = if (conf.cboEnabled) cboStats else defaultStats } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala index 2fa53a6466ef2..455037e6c9952 100755 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala @@ -620,7 +620,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase { rowCount = Some(expectedRowCount), attributeStats = expectedAttributeMap) - val filterStats = filter.stats(conf) + val filterStats = filter.stats assert(filterStats.sizeInBytes == expectedStats.sizeInBytes) assert(filterStats.rowCount == expectedStats.rowCount) val rowCountValue = filterStats.rowCount.getOrElse(0) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala index 2d6b6e8e21f34..097c78eb27fca 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala @@ -77,7 +77,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { // Keep the column stat from both sides unchanged. 
attributeStats = AttributeMap( Seq("key-1-5", "key-5-9", "key-1-2", "key-2-4").map(nameToColInfo))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("disjoint inner join") { @@ -90,7 +90,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { sizeInBytes = 1, rowCount = Some(0), attributeStats = AttributeMap(Nil)) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("disjoint left outer join") { @@ -106,7 +106,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { // Null count for right side columns = left row count Seq(nameToAttr("key-1-2") -> nullColumnStat(nameToAttr("key-1-2").dataType, 5), nameToAttr("key-2-4") -> nullColumnStat(nameToAttr("key-2-4").dataType, 5)))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("disjoint right outer join") { @@ -122,7 +122,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { // Null count for left side columns = right row count Seq(nameToAttr("key-1-5") -> nullColumnStat(nameToAttr("key-1-5").dataType, 3), nameToAttr("key-5-9") -> nullColumnStat(nameToAttr("key-5-9").dataType, 3)))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("disjoint full outer join") { @@ -140,7 +140,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { nameToAttr("key-5-9") -> columnInfo(nameToAttr("key-5-9")).copy(nullCount = 3), nameToAttr("key-1-2") -> columnInfo(nameToAttr("key-1-2")).copy(nullCount = 5), nameToAttr("key-2-4") -> columnInfo(nameToAttr("key-2-4")).copy(nullCount = 5)))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("inner join") { @@ -161,7 +161,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { attributeStats = AttributeMap( Seq(nameToAttr("key-1-5") -> joinedColStat, nameToAttr("key-1-2") -> joinedColStat, nameToAttr("key-5-9") -> colStatForkey59, nameToColInfo("key-2-4")))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("inner join with multiple equi-join keys") { @@ -183,7 +183,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { attributeStats = AttributeMap( Seq(nameToAttr("key-1-2") -> joinedColStat1, nameToAttr("key-1-2") -> joinedColStat1, nameToAttr("key-2-4") -> joinedColStat2, nameToAttr("key-2-3") -> joinedColStat2))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("left outer join") { @@ -201,7 +201,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { attributeStats = AttributeMap( Seq(nameToColInfo("key-1-2"), nameToColInfo("key-2-3"), nameToColInfo("key-1-2"), nameToAttr("key-2-4") -> joinedColStat))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("right outer join") { @@ -219,7 +219,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { attributeStats = AttributeMap( Seq(nameToColInfo("key-1-2"), nameToAttr("key-2-4") -> joinedColStat, nameToColInfo("key-1-2"), nameToColInfo("key-2-3")))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("full outer join") { @@ -234,7 +234,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { // Keep the column stat from both sides unchanged. 
attributeStats = AttributeMap(Seq(nameToColInfo("key-1-2"), nameToColInfo("key-2-4"), nameToColInfo("key-1-2"), nameToColInfo("key-2-3")))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } test("left semi/anti join") { @@ -248,7 +248,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { sizeInBytes = 3 * (8 + 4 * 2), rowCount = Some(3), attributeStats = AttributeMap(Seq(nameToColInfo("key-1-2"), nameToColInfo("key-2-4")))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } } @@ -306,7 +306,7 @@ class JoinEstimationSuite extends StatsEstimationTestBase { sizeInBytes = 1 * (8 + 2 * getColSize(key1, columnInfo1(key1))), rowCount = Some(1), attributeStats = AttributeMap(Seq(key1 -> columnInfo1(key1), key2 -> columnInfo1(key1)))) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } } } @@ -323,6 +323,6 @@ class JoinEstimationSuite extends StatsEstimationTestBase { sizeInBytes = 1, rowCount = Some(0), attributeStats = AttributeMap(Nil)) - assert(join.stats(conf) == expectedStats) + assert(join.stats == expectedStats) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala index a5c4d22a29386..cda54fa9d64f4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala @@ -45,7 +45,7 @@ class ProjectEstimationSuite extends StatsEstimationTestBase { sizeInBytes = 2 * (8 + 4 + 4), rowCount = Some(2), attributeStats = expectedAttrStats) - assert(proj.stats(conf) == expectedStats) + assert(proj.stats == expectedStats) } test("project on empty table") { @@ -131,6 +131,6 @@ class ProjectEstimationSuite extends StatsEstimationTestBase { sizeInBytes = expectedSize, rowCount = Some(expectedRowCount), attributeStats = projectAttrMap) - assert(proj.stats(conf) == expectedStats) + assert(proj.stats == expectedStats) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala index 263f4e18803d5..eaa33e44a6a5a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala @@ -21,14 +21,24 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.{CASE_SENSITIVE, CBO_ENABLED} import org.apache.spark.sql.types.{IntegerType, StringType} trait StatsEstimationTestBase extends SparkFunSuite { - /** Enable stats estimation based on CBO. */ - protected val conf = new SQLConf().copy(CASE_SENSITIVE -> true, CBO_ENABLED -> true) + var originalValue: Boolean = false + + override def beforeAll(): Unit = { + super.beforeAll() + // Enable stats estimation based on CBO. 
+ originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) + SQLConf.get.setConf(SQLConf.CBO_ENABLED, true) + } + + override def afterAll(): Unit = { + SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) + super.afterAll() + } def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match { // For UTF8String: base + offset + numBytes @@ -55,7 +65,7 @@ case class StatsTestPlan( attributeStats: AttributeMap[ColumnStat], size: Option[BigInt] = None) extends LeafNode { override def output: Seq[Attribute] = outputList - override def computeStats(conf: SQLConf): Statistics = Statistics( + override def computeStats: Statistics = Statistics( // If sizeInBytes is useless in testing, we just use a fake value sizeInBytes = size.getOrElse(Int.MaxValue), rowCount = Some(rowCount), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 3d1b481a53e75..66f66a289a065 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils @@ -89,7 +88,7 @@ case class ExternalRDD[T]( override protected def stringArgs: Iterator[Any] = Iterator(output) - @transient override def computeStats(conf: SQLConf): Statistics = Statistics( + @transient override def computeStats: Statistics = Statistics( // TODO: Instead of returning a default value here, find a way to return a meaningful size // estimate for RDDs. See PR 1238 for more discussions. sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) @@ -157,7 +156,7 @@ case class LogicalRDD( override protected def stringArgs: Iterator[Any] = Iterator(output) - @transient override def computeStats(conf: SQLConf): Statistics = Statistics( + @transient override def computeStats: Statistics = Statistics( // TODO: Instead of returning a default value here, find a way to return a meaningful size // estimate for RDDs. See PR 1238 for more discussions. 
sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 34998cbd61552..c7cac332a0377 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -221,7 +221,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { def stringWithStats: String = { // trigger to compute stats for logical plans - optimizedPlan.stats(sparkSession.sessionState.conf) + optimizedPlan.stats // only show optimized logical plan and physical plan s"""== Optimized Logical Plan == diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index ea86f6e00fefa..a57d5abb90c0e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.Strategy import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.First import org.apache.spark.sql.catalyst.planning._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -114,9 +113,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * Matches a plan whose output should be small enough to be used in broadcast join. */ private def canBroadcast(plan: LogicalPlan): Boolean = { - plan.stats(conf).hints.broadcast || - (plan.stats(conf).sizeInBytes >= 0 && - plan.stats(conf).sizeInBytes <= conf.autoBroadcastJoinThreshold) + plan.stats.hints.broadcast || + (plan.stats.sizeInBytes >= 0 && + plan.stats.sizeInBytes <= conf.autoBroadcastJoinThreshold) } /** @@ -126,7 +125,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * dynamic. */ private def canBuildLocalHashMap(plan: LogicalPlan): Boolean = { - plan.stats(conf).sizeInBytes < conf.autoBroadcastJoinThreshold * conf.numShufflePartitions + plan.stats.sizeInBytes < conf.autoBroadcastJoinThreshold * conf.numShufflePartitions } /** @@ -137,7 +136,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * use the size of bytes here as estimation. 
*/ private def muchSmaller(a: LogicalPlan, b: LogicalPlan): Boolean = { - a.stats(conf).sizeInBytes * 3 <= b.stats(conf).sizeInBytes + a.stats.sizeInBytes * 3 <= b.stats.sizeInBytes } private def canBuildRight(joinType: JoinType): Boolean = joinType match { @@ -206,7 +205,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Join(left, right, joinType, condition) => val buildSide = - if (right.stats(conf).sizeInBytes <= left.stats(conf).sizeInBytes) { + if (right.stats.sizeInBytes <= left.stats.sizeInBytes) { BuildRight } else { BuildLeft diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 456a8f3b20f30..2972132336de0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.storage.StorageLevel import org.apache.spark.util.LongAccumulator @@ -70,7 +69,7 @@ case class InMemoryRelation( @transient val partitionStatistics = new PartitionStatistics(output) - override def computeStats(conf: SQLConf): Statistics = { + override def computeStats: Statistics = { if (batchStats.value == 0L) { // Underlying columnar RDD hasn't been materialized, no useful statistics information // available, return the default statistics. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 3813f953e06a3..c1b2895f1747e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -20,7 +20,6 @@ import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.util.Utils @@ -46,7 +45,7 @@ case class LogicalRelation( // Only care about relation when canonicalizing. 
override def preCanonicalized: LogicalPlan = copy(catalogTable = None) - @transient override def computeStats(conf: SQLConf): Statistics = { + @transient override def computeStats: Statistics = { catalogTable.flatMap(_.stats.map(_.toPlanStats(output))).getOrElse( Statistics(sizeInBytes = relation.sizeInBytes)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 7eaa803a9ecb4..a5dac469f85b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -230,6 +229,6 @@ case class MemoryPlan(sink: MemorySink, output: Seq[Attribute]) extends LeafNode private val sizePerRow = sink.schema.toAttributes.map(_.dataType.defaultSize).sum - override def computeStats(conf: SQLConf): Statistics = + override def computeStats: Statistics = Statistics(sizePerRow * sink.allData.size) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 8532a5b5bc8eb..506cc2548e260 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -313,7 +313,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext spark.table("testData").queryExecution.withCachedData.collect { case cached: InMemoryRelation => val actualSizeInBytes = (1 to 100).map(i => 4 + i.toString.length + 4).sum - assert(cached.stats(sqlConf).sizeInBytes === actualSizeInBytes) + assert(cached.stats.sizeInBytes === actualSizeInBytes) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 165176f6c040e..87b7b090de3bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1146,7 +1146,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { // instead of Int for avoiding possible overflow. 
val ds = (0 to 10000).map( i => (i, Seq((i, Seq((i, "This is really not that long of a string")))))).toDS() - val sizeInBytes = ds.logicalPlan.stats(sqlConf).sizeInBytes + val sizeInBytes = ds.logicalPlan.stats.sizeInBytes // sizeInBytes is 2404280404, before the fix, it overflows to a negative number assert(sizeInBytes > 0) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 1a66aa85f5a02..895ca196a7a51 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -33,7 +33,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { setupTestData() def statisticSizeInByte(df: DataFrame): BigInt = { - df.queryExecution.optimizedPlan.stats(sqlConf).sizeInBytes + df.queryExecution.optimizedPlan.stats.sizeInBytes } test("equi-join is hash-join") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 601324f2c0172..9824062f969b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -60,7 +60,7 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val df = df1.join(df2, Seq("k"), "left") val sizes = df.queryExecution.analyzed.collect { case g: Join => - g.stats(conf).sizeInBytes + g.stats.sizeInBytes } assert(sizes.size === 1, s"number of Join nodes is wrong:\n ${df.queryExecution}") @@ -107,9 +107,9 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared test("SPARK-15392: DataFrame created from RDD should not be broadcasted") { val rdd = sparkContext.range(1, 100).map(i => Row(i, i)) val df = spark.createDataFrame(rdd, new StructType().add("a", LongType).add("b", LongType)) - assert(df.queryExecution.analyzed.stats(conf).sizeInBytes > + assert(df.queryExecution.analyzed.stats.sizeInBytes > spark.sessionState.conf.autoBroadcastJoinThreshold) - assert(df.selectExpr("a").queryExecution.analyzed.stats(conf).sizeInBytes > + assert(df.selectExpr("a").queryExecution.analyzed.stats.sizeInBytes > spark.sessionState.conf.autoBroadcastJoinThreshold) } @@ -250,13 +250,13 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils test("SPARK-18856: non-empty partitioned table should not report zero size") { withTable("ds_tbl", "hive_tbl") { spark.range(100).select($"id", $"id" % 5 as "p").write.partitionBy("p").saveAsTable("ds_tbl") - val stats = spark.table("ds_tbl").queryExecution.optimizedPlan.stats(conf) + val stats = spark.table("ds_tbl").queryExecution.optimizedPlan.stats assert(stats.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") { sql("CREATE TABLE hive_tbl(i int) PARTITIONED BY (j int)") sql("INSERT INTO hive_tbl PARTITION(j=1) SELECT 1") - val stats2 = spark.table("hive_tbl").queryExecution.optimizedPlan.stats(conf) + val stats2 = spark.table("hive_tbl").queryExecution.optimizedPlan.stats assert(stats2.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") } } @@ -296,10 +296,10 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat)) // Check relation statistics - 
assert(relation.stats(conf).sizeInBytes == 0) - assert(relation.stats(conf).rowCount == Some(0)) - assert(relation.stats(conf).attributeStats.size == 1) - val (attribute, colStat) = relation.stats(conf).attributeStats.head + assert(relation.stats.sizeInBytes == 0) + assert(relation.stats.rowCount == Some(0)) + assert(relation.stats.attributeStats.size == 1) + val (attribute, colStat) = relation.stats.attributeStats.head assert(attribute.name == "c1") assert(colStat == emptyColStat) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 109b1d9db60d2..8d411eb191cd9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -126,7 +126,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { .toDF().createOrReplaceTempView("sizeTst") spark.catalog.cacheTable("sizeTst") assert( - spark.table("sizeTst").queryExecution.analyzed.stats(sqlConf).sizeInBytes > + spark.table("sizeTst").queryExecution.analyzed.stats.sizeInBytes > spark.conf.get(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala index becb3aa270401..caf03885e3873 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala @@ -36,7 +36,7 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { }) val totalSize = allFiles.map(_.length()).sum val df = spark.read.parquet(dir.toString) - assert(df.queryExecution.logical.stats(sqlConf).sizeInBytes === BigInt(totalSize)) + assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala index 24a7b7740fa5b..e8420eee7fe9d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala @@ -216,15 +216,15 @@ class MemorySinkSuite extends StreamTest with BeforeAndAfter { // Before adding data, check output checkAnswer(sink.allData, Seq.empty) - assert(plan.stats(sqlConf).sizeInBytes === 0) + assert(plan.stats.sizeInBytes === 0) sink.addBatch(0, 1 to 3) plan.invalidateStatsCache() - assert(plan.stats(sqlConf).sizeInBytes === 12) + assert(plan.stats.sizeInBytes === 12) sink.addBatch(1, 4 to 6) plan.invalidateStatsCache() - assert(plan.stats(sqlConf).sizeInBytes === 24) + assert(plan.stats.sizeInBytes === 24) } ignore("stress test") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala index f9b3ff8405823..0cfe260e52152 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -21,7 +21,6 @@ import java.nio.charset.StandardCharsets import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.{DataFrame, SparkSession, SQLContext, SQLImplicits} -import org.apache.spark.sql.internal.SQLConf /** * A collection of sample data used in SQL tests. @@ -29,8 +28,6 @@ import org.apache.spark.sql.internal.SQLConf private[sql] trait SQLTestData { self => protected def spark: SparkSession - protected def sqlConf: SQLConf = spark.sessionState.conf - // Helper object to import SQL implicits without a concrete SQLContext private object internalImplicits extends SQLImplicits { protected override def _sqlContext: SQLContext = self.spark.sqlContext diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index ff5afc8e3ce05..808dc013f170b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -154,7 +154,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log Some(partitionSchema)) val logicalRelation = cached.getOrElse { - val sizeInBytes = relation.stats(sparkSession.sessionState.conf).sizeInBytes.toLong + val sizeInBytes = relation.stats.sizeInBytes.toLong val fileIndex = { val index = new CatalogFileIndex(sparkSession, relation.tableMeta, sizeInBytes) if (lazyPruningEnabled) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 001bbc230ff18..279db9a397258 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -68,7 +68,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto assert(properties("totalSize").toLong <= 0, "external table totalSize must be <= 0") assert(properties("rawDataSize").toLong <= 0, "external table rawDataSize must be <= 0") - val sizeInBytes = relation.stats(conf).sizeInBytes + val sizeInBytes = relation.stats.sizeInBytes assert(sizeInBytes === BigInt(file1.length() + file2.length())) } } @@ -77,7 +77,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("analyze Hive serde tables") { def queryTotalSize(tableName: String): BigInt = - spark.table(tableName).queryExecution.analyzed.stats(conf).sizeInBytes + spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes // Non-partitioned table sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect() @@ -659,7 +659,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("estimates the size of a test Hive serde tables") { val df = sql("""SELECT * FROM src""") val sizes = df.queryExecution.analyzed.collect { - case relation: CatalogRelation => relation.stats(conf).sizeInBytes + case relation: CatalogRelation => relation.stats.sizeInBytes } assert(sizes.size === 1, s"Size wrong for:\n ${df.queryExecution}") assert(sizes(0).equals(BigInt(5812)), @@ -679,7 +679,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Assert src has a size smaller than the threshold. 
val sizes = df.queryExecution.analyzed.collect { - case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.stats(conf).sizeInBytes + case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.stats.sizeInBytes } assert(sizes.size === 2 && sizes(0) <= spark.sessionState.conf.autoBroadcastJoinThreshold && sizes(1) <= spark.sessionState.conf.autoBroadcastJoinThreshold, @@ -733,7 +733,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Assert src has a size smaller than the threshold. val sizes = df.queryExecution.analyzed.collect { - case relation: CatalogRelation => relation.stats(conf).sizeInBytes + case relation: CatalogRelation => relation.stats.sizeInBytes } assert(sizes.size === 2 && sizes(1) <= spark.sessionState.conf.autoBroadcastJoinThreshold && sizes(0) <= spark.sessionState.conf.autoBroadcastJoinThreshold, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala index d91f25a4da013..3a724aa14f2a9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -86,7 +86,7 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te case relation: LogicalRelation => relation } assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}") - val size2 = relations(0).computeStats(conf).sizeInBytes + val size2 = relations(0).computeStats.sizeInBytes assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes) assert(size2 < tableStats.get.sizeInBytes) } From 1ebe7ffe072bcac03360e65e959a6cd36530a9c4 Mon Sep 17 00:00:00 2001 From: Dhruve Ashar Date: Fri, 23 Jun 2017 10:36:29 -0700 Subject: [PATCH 008/779] [SPARK-21181] Release byteBuffers to suppress netty error messages ## What changes were proposed in this pull request? We are explicitly calling release on the ByteBufs used to encode the string to Base64 to suppress the memory leak error message reported by netty. This is to make it less confusing for the user. ### Changes proposed in this fix By explicitly invoking release on the ByteBufs we are decrementing the internal reference counts for the wrapped ByteBufs. Now, when the GC kicks in, these would be reclaimed as before, just that netty wouldn't report any memory leak error messages as the internal ref. counts are now 0. ## How was this patch tested? Ran a few spark-applications and examined the logs. The error message no longer appears. Original PR was opened against branch-2.1 => https://github.com/apache/spark/pull/18392 Author: Dhruve Ashar Closes #18407 from dhruve/master.
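
As a minimal, self-contained sketch of the wrap/encode/release sequence described above (placeholder object name, not the actual SparkSaslServer change shown in the diff below; assumes netty-buffer and netty-codec are on the classpath):

```scala
import java.nio.charset.StandardCharsets

import io.netty.buffer.{ByteBuf, Unpooled}
import io.netty.handler.codec.base64.Base64

// Base64-encode a string while explicitly releasing both reference-counted
// buffers, so netty has nothing left to report as a leak.
object Base64EncodeSketch {
  def encode(str: String): String = {
    var wrapped: ByteBuf = null
    var encoded: ByteBuf = null
    try {
      wrapped = Unpooled.wrappedBuffer(str.getBytes(StandardCharsets.UTF_8))
      encoded = Base64.encode(wrapped)
      encoded.toString(StandardCharsets.UTF_8)
    } finally {
      // Drop the ref counts to zero; the buffers are reclaimed as before,
      // just without leak warnings in the logs.
      if (wrapped != null) wrapped.release()
      if (encoded != null) encoded.release()
    }
  }
}
```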
--- .../spark/network/sasl/SparkSaslServer.java | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java index e24fdf0c74de3..00f3e83dbc8b3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SparkSaslServer.java @@ -34,6 +34,7 @@ import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableMap; +import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.handler.codec.base64.Base64; import org.slf4j.Logger; @@ -187,14 +188,31 @@ public void handle(Callback[] callbacks) throws IOException, UnsupportedCallback /* Encode a byte[] identifier as a Base64-encoded string. */ public static String encodeIdentifier(String identifier) { Preconditions.checkNotNull(identifier, "User cannot be null if SASL is enabled"); - return Base64.encode(Unpooled.wrappedBuffer(identifier.getBytes(StandardCharsets.UTF_8))) - .toString(StandardCharsets.UTF_8); + return getBase64EncodedString(identifier); } /** Encode a password as a base64-encoded char[] array. */ public static char[] encodePassword(String password) { Preconditions.checkNotNull(password, "Password cannot be null if SASL is enabled"); - return Base64.encode(Unpooled.wrappedBuffer(password.getBytes(StandardCharsets.UTF_8))) - .toString(StandardCharsets.UTF_8).toCharArray(); + return getBase64EncodedString(password).toCharArray(); + } + + /** Return a Base64-encoded string. */ + private static String getBase64EncodedString(String str) { + ByteBuf byteBuf = null; + ByteBuf encodedByteBuf = null; + try { + byteBuf = Unpooled.wrappedBuffer(str.getBytes(StandardCharsets.UTF_8)); + encodedByteBuf = Base64.encode(byteBuf); + return encodedByteBuf.toString(StandardCharsets.UTF_8); + } finally { + // The release is called to suppress the memory leak error messages raised by netty. + if (byteBuf != null) { + byteBuf.release(); + if (encodedByteBuf != null) { + encodedByteBuf.release(); + } + } + } } } From 2ebd0838d165fe33b404e8d86c0fa445d1f47439 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Fri, 23 Jun 2017 10:55:02 -0700 Subject: [PATCH 009/779] [SPARK-21192][SS] Preserve State Store provider class configuration across StreamingQuery restarts ## What changes were proposed in this pull request? If the SQL conf for StateStore provider class is changed between restarts (i.e. query started with providerClass1 and attempted to restart using providerClass2), then the query will fail in a unpredictable way as files saved by one provider class cannot be used by the newer one. Ideally, the provider class used to start the query should be used to restart the query, and the configuration in the session where it is being restarted should be ignored. This PR saves the provider class config to OffsetSeqLog, in the same way # shuffle partitions is saved and recovered. ## How was this patch tested? new unit tests Author: Tathagata Das Closes #18402 from tdas/SPARK-21192. 
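
At the API surface, the behaviour described above looks roughly like the sketch below: the provider class in effect when the query is first started is recorded in the offset log, and a restart from the same checkpoint reuses that class even if the session conf has since changed. The custom provider class name and checkpoint path are hypothetical; the conf key and the default HDFSBackedStateStoreProvider are the ones referenced in this patch.

```scala
import org.apache.spark.sql.SparkSession

object ProviderClassSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("provider-class-sketch").getOrCreate()

    // Hypothetical custom provider: must be a StateStoreProvider subclass
    // with a zero-arg constructor, per the conf's documentation.
    spark.conf.set(
      "spark.sql.streaming.stateStore.providerClass",
      "com.example.MyStateStoreProvider")

    // A stateful aggregation, so a state store is actually loaded.
    val counts = spark.readStream.format("rate").load().groupBy().count()

    val query = counts.writeStream
      .outputMode("complete")
      .format("console")
      .option("checkpointLocation", "/tmp/provider-class-sketch") // hypothetical path
      .start()

    // Restarts from this checkpoint keep using com.example.MyStateStoreProvider,
    // because the class name is read back from the offset log.
    query.awaitTermination()
  }
}
```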
--- .../apache/spark/sql/internal/SQLConf.scala | 5 +- .../sql/execution/streaming/OffsetSeq.scala | 39 +++++++++++++- .../execution/streaming/StreamExecution.scala | 26 +++------- .../streaming/state/StateStore.scala | 3 +- .../streaming/state/StateStoreConf.scala | 2 +- .../streaming/OffsetSeqLogSuite.scala | 10 ++-- .../spark/sql/streaming/StreamSuite.scala | 51 +++++++++++++++---- 7 files changed, 96 insertions(+), 40 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e609256db2802..9c8e26a8eeadf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -601,7 +601,8 @@ object SQLConf { "The class used to manage state data in stateful streaming queries. This class must " + "be a subclass of StateStoreProvider, and must have a zero-arg constructor.") .stringConf - .createOptional + .createWithDefault( + "org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider") val STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT = buildConf("spark.sql.streaming.stateStore.minDeltasForSnapshot") @@ -897,7 +898,7 @@ class SQLConf extends Serializable with Logging { def optimizerInSetConversionThreshold: Int = getConf(OPTIMIZER_INSET_CONVERSION_THRESHOLD) - def stateStoreProviderClass: Option[String] = getConf(STATE_STORE_PROVIDER_CLASS) + def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS) def stateStoreMinDeltasForSnapshot: Int = getConf(STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index 8249adab4bba8..4e0a468b962a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -20,6 +20,10 @@ package org.apache.spark.sql.execution.streaming import org.json4s.NoTypeHints import org.json4s.jackson.Serialization +import org.apache.spark.internal.Logging +import org.apache.spark.sql.RuntimeConfig +import org.apache.spark.sql.internal.SQLConf.{SHUFFLE_PARTITIONS, STATE_STORE_PROVIDER_CLASS} + /** * An ordered collection of offsets, used to track the progress of processing data from one or more * [[Source]]s that are present in a streaming query. 
This is similar to simplified, single-instance @@ -78,7 +82,40 @@ case class OffsetSeqMetadata( def json: String = Serialization.write(this)(OffsetSeqMetadata.format) } -object OffsetSeqMetadata { +object OffsetSeqMetadata extends Logging { private implicit val format = Serialization.formats(NoTypeHints) + private val relevantSQLConfs = Seq(SHUFFLE_PARTITIONS, STATE_STORE_PROVIDER_CLASS) + def apply(json: String): OffsetSeqMetadata = Serialization.read[OffsetSeqMetadata](json) + + def apply( + batchWatermarkMs: Long, + batchTimestampMs: Long, + sessionConf: RuntimeConfig): OffsetSeqMetadata = { + val confs = relevantSQLConfs.map { conf => conf.key -> sessionConf.get(conf.key) }.toMap + OffsetSeqMetadata(batchWatermarkMs, batchTimestampMs, confs) + } + + /** Set the SparkSession configuration with the values in the metadata */ + def setSessionConf(metadata: OffsetSeqMetadata, sessionConf: RuntimeConfig): Unit = { + OffsetSeqMetadata.relevantSQLConfs.map(_.key).foreach { confKey => + + metadata.conf.get(confKey) match { + + case Some(valueInMetadata) => + // Config value exists in the metadata, update the session config with this value + val optionalValueInSession = sessionConf.getOption(confKey) + if (optionalValueInSession.isDefined && optionalValueInSession.get != valueInMetadata) { + logWarning(s"Updating the value of conf '$confKey' in current session from " + + s"'${optionalValueInSession.get}' to '$valueInMetadata'.") + } + sessionConf.set(confKey, valueInMetadata) + + case None => + // For backward compatibility, if a config was not recorded in the offset log, + // then log it, and let the existing conf value in SparkSession prevail. + logWarning (s"Conf '$confKey' was not found in the offset log, using existing value") + } + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 06bdec8b06407..d5f8d2acba92b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -125,9 +125,8 @@ class StreamExecution( } /** Metadata associated with the offset seq of a batch in the query. 
*/ - protected var offsetSeqMetadata = OffsetSeqMetadata(batchWatermarkMs = 0, batchTimestampMs = 0, - conf = Map(SQLConf.SHUFFLE_PARTITIONS.key -> - sparkSession.conf.get(SQLConf.SHUFFLE_PARTITIONS).toString)) + protected var offsetSeqMetadata = OffsetSeqMetadata( + batchWatermarkMs = 0, batchTimestampMs = 0, sparkSession.conf) override val id: UUID = UUID.fromString(streamMetadata.id) @@ -285,9 +284,8 @@ class StreamExecution( val sparkSessionToRunBatches = sparkSession.cloneSession() // Adaptive execution can change num shuffle partitions, disallow sparkSessionToRunBatches.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - offsetSeqMetadata = OffsetSeqMetadata(batchWatermarkMs = 0, batchTimestampMs = 0, - conf = Map(SQLConf.SHUFFLE_PARTITIONS.key -> - sparkSessionToRunBatches.conf.get(SQLConf.SHUFFLE_PARTITIONS.key))) + offsetSeqMetadata = OffsetSeqMetadata( + batchWatermarkMs = 0, batchTimestampMs = 0, sparkSessionToRunBatches.conf) if (state.compareAndSet(INITIALIZING, ACTIVE)) { // Unblock `awaitInitialization` @@ -441,21 +439,9 @@ class StreamExecution( // update offset metadata nextOffsets.metadata.foreach { metadata => - val shufflePartitionsSparkSession: Int = - sparkSessionToRunBatches.conf.get(SQLConf.SHUFFLE_PARTITIONS) - val shufflePartitionsToUse = metadata.conf.getOrElse(SQLConf.SHUFFLE_PARTITIONS.key, { - // For backward compatibility, if # partitions was not recorded in the offset log, - // then ensure it is not missing. The new value is picked up from the conf. - logWarning("Number of shuffle partitions from previous run not found in checkpoint. " - + s"Using the value from the conf, $shufflePartitionsSparkSession partitions.") - shufflePartitionsSparkSession - }) + OffsetSeqMetadata.setSessionConf(metadata, sparkSessionToRunBatches.conf) offsetSeqMetadata = OffsetSeqMetadata( - metadata.batchWatermarkMs, metadata.batchTimestampMs, - metadata.conf + (SQLConf.SHUFFLE_PARTITIONS.key -> shufflePartitionsToUse.toString)) - // Update conf with correct number of shuffle partitions - sparkSessionToRunBatches.conf.set( - SQLConf.SHUFFLE_PARTITIONS.key, shufflePartitionsToUse.toString) + metadata.batchWatermarkMs, metadata.batchTimestampMs, sparkSessionToRunBatches.conf) } /* identify the current batch id: if commit log indicates we successfully processed the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index a94ff8a7ebd1e..86886466c4f56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -172,8 +172,7 @@ object StateStoreProvider { indexOrdinal: Option[Int], // for sorting the data storeConf: StateStoreConf, hadoopConf: Configuration): StateStoreProvider = { - val providerClass = storeConf.providerClass.map(Utils.classForName) - .getOrElse(classOf[HDFSBackedStateStoreProvider]) + val providerClass = Utils.classForName(storeConf.providerClass) val provider = providerClass.newInstance().asInstanceOf[StateStoreProvider] provider.init(stateStoreId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) provider diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala index bab297c7df594..765ff076cb467 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala @@ -38,7 +38,7 @@ class StateStoreConf(@transient private val sqlConf: SQLConf) * Optional fully qualified name of the subclass of [[StateStoreProvider]] * managing state data. That is, the implementation of the State Store to use. */ - val providerClass: Option[String] = sqlConf.stateStoreProviderClass + val providerClass: String = sqlConf.stateStoreProviderClass /** * Additional configurations related to state store. This will capture all configs in diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala index dc556322beddb..e6cdc063c4e9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala @@ -37,16 +37,18 @@ class OffsetSeqLogSuite extends SparkFunSuite with SharedSQLContext { } // None set - assert(OffsetSeqMetadata(0, 0, Map.empty) === OffsetSeqMetadata("""{}""")) + assert(new OffsetSeqMetadata(0, 0, Map.empty) === OffsetSeqMetadata("""{}""")) // One set - assert(OffsetSeqMetadata(1, 0, Map.empty) === OffsetSeqMetadata("""{"batchWatermarkMs":1}""")) - assert(OffsetSeqMetadata(0, 2, Map.empty) === OffsetSeqMetadata("""{"batchTimestampMs":2}""")) + assert(new OffsetSeqMetadata(1, 0, Map.empty) === + OffsetSeqMetadata("""{"batchWatermarkMs":1}""")) + assert(new OffsetSeqMetadata(0, 2, Map.empty) === + OffsetSeqMetadata("""{"batchTimestampMs":2}""")) assert(OffsetSeqMetadata(0, 0, getConfWith(shufflePartitions = 2)) === OffsetSeqMetadata(s"""{"conf": {"$key":2}}""")) // Two set - assert(OffsetSeqMetadata(1, 2, Map.empty) === + assert(new OffsetSeqMetadata(1, 2, Map.empty) === OffsetSeqMetadata("""{"batchWatermarkMs":1,"batchTimestampMs":2}""")) assert(OffsetSeqMetadata(1, 0, getConfWith(shufflePartitions = 3)) === OffsetSeqMetadata(s"""{"batchWatermarkMs":1,"conf": {"$key":3}}""")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 86c3a35a59c13..6f7b9d35a6bb3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -637,19 +637,11 @@ class StreamSuite extends StreamTest { } testQuietly("specify custom state store provider") { - val queryName = "memStream" val providerClassName = classOf[TestStateStoreProvider].getCanonicalName withSQLConf("spark.sql.streaming.stateStore.providerClass" -> providerClassName) { val input = MemoryStream[Int] - val query = input - .toDS() - .groupBy() - .count() - .writeStream - .outputMode("complete") - .format("memory") - .queryName(queryName) - .start() + val df = input.toDS().groupBy().count() + val query = df.writeStream.outputMode("complete").format("memory").queryName("name").start() input.addData(1, 2, 3) val e = intercept[Exception] { query.awaitTermination() @@ -659,6 +651,45 @@ class StreamSuite extends StreamTest { assert(e.getMessage.contains("instantiated")) } } + + testQuietly("custom state store provider read from offset log") { + val input = MemoryStream[Int] + val df = input.toDS().groupBy().count() + val providerConf1 = 
"spark.sql.streaming.stateStore.providerClass" -> + "org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider" + val providerConf2 = "spark.sql.streaming.stateStore.providerClass" -> + classOf[TestStateStoreProvider].getCanonicalName + + def runQuery(queryName: String, checkpointLoc: String): Unit = { + val query = df.writeStream + .outputMode("complete") + .format("memory") + .queryName(queryName) + .option("checkpointLocation", checkpointLoc) + .start() + input.addData(1, 2, 3) + query.processAllAvailable() + query.stop() + } + + withTempDir { dir => + val checkpointLoc1 = new File(dir, "1").getCanonicalPath + withSQLConf(providerConf1) { + runQuery("query1", checkpointLoc1) // generate checkpoints + } + + val checkpointLoc2 = new File(dir, "2").getCanonicalPath + withSQLConf(providerConf2) { + // Verify new query will use new provider that throw error on loading + intercept[Exception] { + runQuery("query2", checkpointLoc2) + } + + // Verify old query from checkpoint will still use old provider + runQuery("query1", checkpointLoc1) + } + } + } } abstract class FakeSource extends StreamSourceProvider { From 4cc62951a2b12a372a2b267bf8597a0a31e2b2cb Mon Sep 17 00:00:00 2001 From: Ong Ming Yang Date: Fri, 23 Jun 2017 10:56:59 -0700 Subject: [PATCH 010/779] [MINOR][DOCS] Docs in DataFrameNaFunctions.scala use wrong method ## What changes were proposed in this pull request? * Following the first few examples in this file, the remaining methods should also be methods of `df.na` not `df`. * Filled in some missing parentheses ## How was this patch tested? N/A Author: Ong Ming Yang Closes #18398 from ongmingyang/master. --- .../spark/sql/DataFrameNaFunctions.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index ee949e78fa3ba..871fff71e5538 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -268,13 +268,13 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * import com.google.common.collect.ImmutableMap; * * // Replaces all occurrences of 1.0 with 2.0 in column "height". - * df.replace("height", ImmutableMap.of(1.0, 2.0)); + * df.na.replace("height", ImmutableMap.of(1.0, 2.0)); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". - * df.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); + * df.na.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. - * df.replace("*", ImmutableMap.of("UNKNOWN", "unnamed")); + * df.na.replace("*", ImmutableMap.of("UNKNOWN", "unnamed")); * }}} * * @param col name of the column to apply the value replacement @@ -295,10 +295,10 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * import com.google.common.collect.ImmutableMap; * * // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". - * df.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); + * df.na.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". 
- * df.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed")); + * df.na.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed")); * }}} * * @param cols list of columns to apply the value replacement @@ -319,13 +319,13 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * * {{{ * // Replaces all occurrences of 1.0 with 2.0 in column "height". - * df.replace("height", Map(1.0 -> 2.0)) + * df.na.replace("height", Map(1.0 -> 2.0)); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". - * df.replace("name", Map("UNKNOWN" -> "unnamed") + * df.na.replace("name", Map("UNKNOWN" -> "unnamed")); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. - * df.replace("*", Map("UNKNOWN" -> "unnamed") + * df.na.replace("*", Map("UNKNOWN" -> "unnamed")); * }}} * * @param col name of the column to apply the value replacement @@ -348,10 +348,10 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * * {{{ * // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". - * df.replace("height" :: "weight" :: Nil, Map(1.0 -> 2.0)); + * df.na.replace("height" :: "weight" :: Nil, Map(1.0 -> 2.0)); * * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". - * df.replace("firstname" :: "lastname" :: Nil, Map("UNKNOWN" -> "unnamed"); + * df.na.replace("firstname" :: "lastname" :: Nil, Map("UNKNOWN" -> "unnamed")); * }}} * * @param cols list of columns to apply the value replacement From 13c2a4f2f8c6d3484f920caadddf4e5edce0a945 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Fri, 23 Jun 2017 11:02:54 -0700 Subject: [PATCH 011/779] [SPARK-20417][SQL] Move subquery error handling to checkAnalysis from Analyzer ## What changes were proposed in this pull request? Currently we do a lot of validations for subquery in the Analyzer. We should move them to CheckAnalysis which is the framework to catch and report Analysis errors. This was mentioned as a review comment in SPARK-18874. ## How was this patch tested? Exists tests + A few tests added to SQLQueryTestSuite. Author: Dilip Biswal Closes #17713 from dilipbiswal/subquery_checkanalysis. 
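
For a sense of what these relocated checks report, the following is a minimal sketch (hypothetical view and column names, not one of the SQLQueryTestSuite cases added here) of an IN subquery whose select list has more columns than the left-hand side. It should fail analysis with an AnalysisException, now raised from CheckAnalysis rather than from inside the Analyzer's resolution rules.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object SubqueryCheckSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("subquery-check-sketch").getOrCreate()
    import spark.implicits._

    Seq((1, 10), (2, 20)).toDF("c1", "c2").createOrReplaceTempView("t1")
    Seq((1, "a"), (3, "c")).toDF("c2", "c3").createOrReplaceTempView("t2")

    try {
      // One expression on the left, two columns from the subquery: invalid.
      spark.sql("SELECT * FROM t1 WHERE c1 IN (SELECT c2, c3 FROM t2)").collect()
    } catch {
      case e: AnalysisException =>
        // Expected: the column-count mismatch is flagged as an analysis error.
        println(s"Analysis failed as expected: ${e.getMessage}")
    }
  }
}
```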
--- .../sql/catalyst/analysis/Analyzer.scala | 230 +----------- .../sql/catalyst/analysis/CheckAnalysis.scala | 338 ++++++++++++++---- .../sql/catalyst/expressions/predicates.scala | 46 ++- .../analysis/AnalysisErrorSuite.scala | 3 +- .../analysis/ResolveSubquerySuite.scala | 2 +- .../negative-cases/subq-input-typecheck.sql | 47 +++ .../subq-input-typecheck.sql.out | 106 ++++++ .../org/apache/spark/sql/SubquerySuite.scala | 2 +- 8 files changed, 464 insertions(+), 310 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 647fc0b9342c1..193082eb77024 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects} import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ -import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, _} import org.apache.spark.sql.catalyst.rules._ @@ -1257,217 +1256,16 @@ class Analyzer( } /** - * Validates to make sure the outer references appearing inside the subquery - * are legal. This function also returns the list of expressions - * that contain outer references. These outer references would be kept as children - * of subquery expressions by the caller of this function. - */ - private def checkAndGetOuterReferences(sub: LogicalPlan): Seq[Expression] = { - val outerReferences = ArrayBuffer.empty[Expression] - - // Validate that correlated aggregate expression do not contain a mixture - // of outer and local references. - def checkMixedReferencesInsideAggregateExpr(expr: Expression): Unit = { - expr.foreach { - case a: AggregateExpression if containsOuter(a) => - val outer = a.collect { case OuterReference(e) => e.toAttribute } - val local = a.references -- outer - if (local.nonEmpty) { - val msg = - s""" - |Found an aggregate expression in a correlated predicate that has both - |outer and local references, which is not supported yet. - |Aggregate expression: ${SubExprUtils.stripOuterReference(a).sql}, - |Outer references: ${outer.map(_.sql).mkString(", ")}, - |Local references: ${local.map(_.sql).mkString(", ")}. - """.stripMargin.replace("\n", " ").trim() - failAnalysis(msg) - } - case _ => - } - } - - // Make sure a plan's subtree does not contain outer references - def failOnOuterReferenceInSubTree(p: LogicalPlan): Unit = { - if (hasOuterReferences(p)) { - failAnalysis(s"Accessing outer query column is not allowed in:\n$p") - } - } - - // Make sure a plan's expressions do not contain : - // 1. Aggregate expressions that have mixture of outer and local references. - // 2. Expressions containing outer references on plan nodes other than Filter. 
- def failOnInvalidOuterReference(p: LogicalPlan): Unit = { - p.expressions.foreach(checkMixedReferencesInsideAggregateExpr) - if (!p.isInstanceOf[Filter] && p.expressions.exists(containsOuter)) { - failAnalysis( - "Expressions referencing the outer query are not supported outside of WHERE/HAVING " + - s"clauses:\n$p") - } - } - - // SPARK-17348: A potential incorrect result case. - // When a correlated predicate is a non-equality predicate, - // certain operators are not permitted from the operator - // hosting the correlated predicate up to the operator on the outer table. - // Otherwise, the pull up of the correlated predicate - // will generate a plan with a different semantics - // which could return incorrect result. - // Currently we check for Aggregate and Window operators - // - // Below shows an example of a Logical Plan during Analyzer phase that - // show this problem. Pulling the correlated predicate [outer(c2#77) >= ..] - // through the Aggregate (or Window) operator could alter the result of - // the Aggregate. - // - // Project [c1#76] - // +- Project [c1#87, c2#88] - // : (Aggregate or Window operator) - // : +- Filter [outer(c2#77) >= c2#88)] - // : +- SubqueryAlias t2, `t2` - // : +- Project [_1#84 AS c1#87, _2#85 AS c2#88] - // : +- LocalRelation [_1#84, _2#85] - // +- SubqueryAlias t1, `t1` - // +- Project [_1#73 AS c1#76, _2#74 AS c2#77] - // +- LocalRelation [_1#73, _2#74] - def failOnNonEqualCorrelatedPredicate(found: Boolean, p: LogicalPlan): Unit = { - if (found) { - // Report a non-supported case as an exception - failAnalysis(s"Correlated column is not allowed in a non-equality predicate:\n$p") - } - } - - var foundNonEqualCorrelatedPred : Boolean = false - - // Simplify the predicates before validating any unsupported correlation patterns - // in the plan. - BooleanSimplification(sub).foreachUp { - - // Whitelist operators allowed in a correlated subquery - // There are 4 categories: - // 1. Operators that are allowed anywhere in a correlated subquery, and, - // by definition of the operators, they either do not contain - // any columns or cannot host outer references. - // 2. Operators that are allowed anywhere in a correlated subquery - // so long as they do not host outer references. - // 3. Operators that need special handlings. These operators are - // Project, Filter, Join, Aggregate, and Generate. - // - // Any operators that are not in the above list are allowed - // in a correlated subquery only if they are not on a correlation path. - // In other word, these operators are allowed only under a correlation point. - // - // A correlation path is defined as the sub-tree of all the operators that - // are on the path from the operator hosting the correlated expressions - // up to the operator producing the correlated values. - - // Category 1: - // BroadcastHint, Distinct, LeafNode, Repartition, and SubqueryAlias - case _: ResolvedHint | _: Distinct | _: LeafNode | _: Repartition | _: SubqueryAlias => - - // Category 2: - // These operators can be anywhere in a correlated subquery. - // so long as they do not host outer references in the operators. - case s: Sort => - failOnInvalidOuterReference(s) - case r: RepartitionByExpression => - failOnInvalidOuterReference(r) - - // Category 3: - // Filter is one of the two operators allowed to host correlated expressions. - // The other operator is Join. Filter can be anywhere in a correlated subquery. - case f: Filter => - // Find all predicates with an outer reference. 
- val (correlated, _) = splitConjunctivePredicates(f.condition).partition(containsOuter) - - // Find any non-equality correlated predicates - foundNonEqualCorrelatedPred = foundNonEqualCorrelatedPred || correlated.exists { - case _: EqualTo | _: EqualNullSafe => false - case _ => true - } - - failOnInvalidOuterReference(f) - // The aggregate expressions are treated in a special way by getOuterReferences. If the - // aggregate expression contains only outer reference attributes then the entire aggregate - // expression is isolated as an OuterReference. - // i.e min(OuterReference(b)) => OuterReference(min(b)) - outerReferences ++= getOuterReferences(correlated) - - // Project cannot host any correlated expressions - // but can be anywhere in a correlated subquery. - case p: Project => - failOnInvalidOuterReference(p) - - // Aggregate cannot host any correlated expressions - // It can be on a correlation path if the correlation contains - // only equality correlated predicates. - // It cannot be on a correlation path if the correlation has - // non-equality correlated predicates. - case a: Aggregate => - failOnInvalidOuterReference(a) - failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, a) - - // Join can host correlated expressions. - case j @ Join(left, right, joinType, _) => - joinType match { - // Inner join, like Filter, can be anywhere. - case _: InnerLike => - failOnInvalidOuterReference(j) - - // Left outer join's right operand cannot be on a correlation path. - // LeftAnti and ExistenceJoin are special cases of LeftOuter. - // Note that ExistenceJoin cannot be expressed externally in both SQL and DataFrame - // so it should not show up here in Analysis phase. This is just a safety net. - // - // LeftSemi does not allow output from the right operand. - // Any correlated references in the subplan - // of the right operand cannot be pulled up. - case LeftOuter | LeftSemi | LeftAnti | ExistenceJoin(_) => - failOnInvalidOuterReference(j) - failOnOuterReferenceInSubTree(right) - - // Likewise, Right outer join's left operand cannot be on a correlation path. - case RightOuter => - failOnInvalidOuterReference(j) - failOnOuterReferenceInSubTree(left) - - // Any other join types not explicitly listed above, - // including Full outer join, are treated as Category 4. - case _ => - failOnOuterReferenceInSubTree(j) - } - - // Generator with join=true, i.e., expressed with - // LATERAL VIEW [OUTER], similar to inner join, - // allows to have correlation under it - // but must not host any outer references. - // Note: - // Generator with join=false is treated as Category 4. - case g: Generate if g.join => - failOnInvalidOuterReference(g) - - // Category 4: Any other operators not in the above 3 categories - // cannot be on a correlation path, that is they are allowed only - // under a correlation point but they and their descendant operators - // are not allowed to have any correlated expressions. - case p => - failOnOuterReferenceInSubTree(p) - } - outerReferences - } - - /** - * Resolves the subquery. The subquery is resolved using its outer plans. This method - * will resolve the subquery by alternating between the regular analyzer and by applying the - * resolveOuterReferences rule. + * Resolves the subquery plan that is referenced in a subquery expression. The normal + * attribute references are resolved using regular analyzer and the outer references are + * resolved from the outer plans using the resolveOuterReferences method. 
* * Outer references from the correlated predicates are updated as children of * Subquery expression. */ private def resolveSubQuery( e: SubqueryExpression, - plans: Seq[LogicalPlan], - requiredColumns: Int = 0)( + plans: Seq[LogicalPlan])( f: (LogicalPlan, Seq[Expression]) => SubqueryExpression): SubqueryExpression = { // Step 1: Resolve the outer expressions. var previous: LogicalPlan = null @@ -1488,15 +1286,8 @@ class Analyzer( // Step 2: If the subquery plan is fully resolved, pull the outer references and record // them as children of SubqueryExpression. if (current.resolved) { - // Make sure the resolved query has the required number of output columns. This is only - // needed for Scalar and IN subqueries. - if (requiredColumns > 0 && requiredColumns != current.output.size) { - failAnalysis(s"The number of columns in the subquery (${current.output.size}) " + - s"does not match the required number of columns ($requiredColumns)") - } - // Validate the outer reference and record the outer references as children of - // subquery expression. - f(current, checkAndGetOuterReferences(current)) + // Record the outer references as children of subquery expression. + f(current, SubExprUtils.getOuterReferences(current)) } else { e.withNewPlan(current) } @@ -1514,16 +1305,11 @@ class Analyzer( private def resolveSubQueries(plan: LogicalPlan, plans: Seq[LogicalPlan]): LogicalPlan = { plan transformExpressions { case s @ ScalarSubquery(sub, _, exprId) if !sub.resolved => - resolveSubQuery(s, plans, 1)(ScalarSubquery(_, _, exprId)) + resolveSubQuery(s, plans)(ScalarSubquery(_, _, exprId)) case e @ Exists(sub, _, exprId) if !sub.resolved => resolveSubQuery(e, plans)(Exists(_, _, exprId)) case In(value, Seq(l @ ListQuery(sub, _, exprId))) if value.resolved && !sub.resolved => - // Get the left hand side expressions. 
- val expressions = value match { - case cns : CreateNamedStruct => cns.valExprs - case expr => Seq(expr) - } - val expr = resolveSubQuery(l, plans, expressions.size)(ListQuery(_, _, exprId)) + val expr = resolveSubQuery(l, plans)(ListQuery(_, _, exprId)) In(value, Seq(expr)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 2e3ac3e474866..fb81a7006bc5e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -21,6 +21,8 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ +import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification +import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ @@ -129,61 +131,8 @@ trait CheckAnalysis extends PredicateHelper { case None => w } - case s @ ScalarSubquery(query, conditions, _) => - checkAnalysis(query) - - // If no correlation, the output must be exactly one column - if (conditions.isEmpty && query.output.size != 1) { - failAnalysis( - s"Scalar subquery must return only one column, but got ${query.output.size}") - } else if (conditions.nonEmpty) { - def checkAggregate(agg: Aggregate): Unit = { - // Make sure correlated scalar subqueries contain one row for every outer row by - // enforcing that they are aggregates containing exactly one aggregate expression. - // The analyzer has already checked that subquery contained only one output column, - // and added all the grouping expressions to the aggregate. - val aggregates = agg.expressions.flatMap(_.collect { - case a: AggregateExpression => a - }) - if (aggregates.isEmpty) { - failAnalysis("The output of a correlated scalar subquery must be aggregated") - } - - // SPARK-18504/SPARK-18814: Block cases where GROUP BY columns - // are not part of the correlated columns. - val groupByCols = AttributeSet(agg.groupingExpressions.flatMap(_.references)) - // Collect the local references from the correlated predicate in the subquery. - val subqueryColumns = getCorrelatedPredicates(query).flatMap(_.references) - .filterNot(conditions.flatMap(_.references).contains) - val correlatedCols = AttributeSet(subqueryColumns) - val invalidCols = groupByCols -- correlatedCols - // GROUP BY columns must be a subset of columns in the predicates - if (invalidCols.nonEmpty) { - failAnalysis( - "A GROUP BY clause in a scalar correlated subquery " + - "cannot contain non-correlated columns: " + - invalidCols.mkString(",")) - } - } - - // Skip subquery aliases added by the Analyzer. - // For projects, do the necessary mapping and skip to its child. 
- def cleanQuery(p: LogicalPlan): LogicalPlan = p match { - case s: SubqueryAlias => cleanQuery(s.child) - case p: Project => cleanQuery(p.child) - case child => child - } - - cleanQuery(query) match { - case a: Aggregate => checkAggregate(a) - case Filter(_, a: Aggregate) => checkAggregate(a) - case fail => failAnalysis(s"Correlated scalar subqueries must be Aggregated: $fail") - } - } - s - case s: SubqueryExpression => - checkAnalysis(s.plan) + checkSubqueryExpression(operator, s) s } @@ -291,19 +240,6 @@ trait CheckAnalysis extends PredicateHelper { case LocalLimit(limitExpr, _) => checkLimitClause(limitExpr) - case p if p.expressions.exists(ScalarSubquery.hasCorrelatedScalarSubquery) => - p match { - case _: Filter | _: Aggregate | _: Project => // Ok - case other => failAnalysis( - s"Correlated scalar sub-queries can only be used in a Filter/Aggregate/Project: $p") - } - - case p if p.expressions.exists(SubqueryExpression.hasInOrExistsSubquery) => - p match { - case _: Filter => // Ok - case _ => failAnalysis(s"Predicate sub-queries can only be used in a Filter: $p") - } - case _: Union | _: SetOperation if operator.children.length > 1 => def dataTypes(plan: LogicalPlan): Seq[DataType] = plan.output.map(_.dataType) def ordinalNumber(i: Int): String = i match { @@ -414,4 +350,272 @@ trait CheckAnalysis extends PredicateHelper { plan.foreach(_.setAnalyzed()) } + + /** + * Validates subquery expressions in the plan. Upon failure, returns an user facing error. + */ + private def checkSubqueryExpression(plan: LogicalPlan, expr: SubqueryExpression): Unit = { + def checkAggregateInScalarSubquery( + conditions: Seq[Expression], + query: LogicalPlan, agg: Aggregate): Unit = { + // Make sure correlated scalar subqueries contain one row for every outer row by + // enforcing that they are aggregates containing exactly one aggregate expression. + val aggregates = agg.expressions.flatMap(_.collect { + case a: AggregateExpression => a + }) + if (aggregates.isEmpty) { + failAnalysis("The output of a correlated scalar subquery must be aggregated") + } + + // SPARK-18504/SPARK-18814: Block cases where GROUP BY columns + // are not part of the correlated columns. + val groupByCols = AttributeSet(agg.groupingExpressions.flatMap(_.references)) + // Collect the local references from the correlated predicate in the subquery. + val subqueryColumns = getCorrelatedPredicates(query).flatMap(_.references) + .filterNot(conditions.flatMap(_.references).contains) + val correlatedCols = AttributeSet(subqueryColumns) + val invalidCols = groupByCols -- correlatedCols + // GROUP BY columns must be a subset of columns in the predicates + if (invalidCols.nonEmpty) { + failAnalysis( + "A GROUP BY clause in a scalar correlated subquery " + + "cannot contain non-correlated columns: " + + invalidCols.mkString(",")) + } + } + + // Skip subquery aliases added by the Analyzer. + // For projects, do the necessary mapping and skip to its child. + def cleanQueryInScalarSubquery(p: LogicalPlan): LogicalPlan = p match { + case s: SubqueryAlias => cleanQueryInScalarSubquery(s.child) + case p: Project => cleanQueryInScalarSubquery(p.child) + case child => child + } + + // Validate the subquery plan. + checkAnalysis(expr.plan) + + expr match { + case ScalarSubquery(query, conditions, _) => + // Scalar subquery must return one column as output. 
+ if (query.output.size != 1) { + failAnalysis( + s"Scalar subquery must return only one column, but got ${query.output.size}") + } + + if (conditions.nonEmpty) { + cleanQueryInScalarSubquery(query) match { + case a: Aggregate => checkAggregateInScalarSubquery(conditions, query, a) + case Filter(_, a: Aggregate) => checkAggregateInScalarSubquery(conditions, query, a) + case fail => failAnalysis(s"Correlated scalar subqueries must be aggregated: $fail") + } + + // Only certain operators are allowed to host subquery expression containing + // outer references. + plan match { + case _: Filter | _: Aggregate | _: Project => // Ok + case other => failAnalysis( + "Correlated scalar sub-queries can only be used in a " + + s"Filter/Aggregate/Project: $plan") + } + } + + case inSubqueryOrExistsSubquery => + plan match { + case _: Filter => // Ok + case _ => + failAnalysis(s"IN/EXISTS predicate sub-queries can only be used in a Filter: $plan") + } + } + + // Validate to make sure the correlations appearing in the query are valid and + // allowed by spark. + checkCorrelationsInSubquery(expr.plan) + } + + /** + * Validates to make sure the outer references appearing inside the subquery + * are allowed. + */ + private def checkCorrelationsInSubquery(sub: LogicalPlan): Unit = { + // Validate that correlated aggregate expression do not contain a mixture + // of outer and local references. + def checkMixedReferencesInsideAggregateExpr(expr: Expression): Unit = { + expr.foreach { + case a: AggregateExpression if containsOuter(a) => + val outer = a.collect { case OuterReference(e) => e.toAttribute } + val local = a.references -- outer + if (local.nonEmpty) { + val msg = + s""" + |Found an aggregate expression in a correlated predicate that has both + |outer and local references, which is not supported yet. + |Aggregate expression: ${SubExprUtils.stripOuterReference(a).sql}, + |Outer references: ${outer.map(_.sql).mkString(", ")}, + |Local references: ${local.map(_.sql).mkString(", ")}. + """.stripMargin.replace("\n", " ").trim() + failAnalysis(msg) + } + case _ => + } + } + + // Make sure a plan's subtree does not contain outer references + def failOnOuterReferenceInSubTree(p: LogicalPlan): Unit = { + if (hasOuterReferences(p)) { + failAnalysis(s"Accessing outer query column is not allowed in:\n$p") + } + } + + // Make sure a plan's expressions do not contain : + // 1. Aggregate expressions that have mixture of outer and local references. + // 2. Expressions containing outer references on plan nodes other than Filter. + def failOnInvalidOuterReference(p: LogicalPlan): Unit = { + p.expressions.foreach(checkMixedReferencesInsideAggregateExpr) + if (!p.isInstanceOf[Filter] && p.expressions.exists(containsOuter)) { + failAnalysis( + "Expressions referencing the outer query are not supported outside of WHERE/HAVING " + + s"clauses:\n$p") + } + } + + // SPARK-17348: A potential incorrect result case. + // When a correlated predicate is a non-equality predicate, + // certain operators are not permitted from the operator + // hosting the correlated predicate up to the operator on the outer table. + // Otherwise, the pull up of the correlated predicate + // will generate a plan with a different semantics + // which could return incorrect result. + // Currently we check for Aggregate and Window operators + // + // Below shows an example of a Logical Plan during Analyzer phase that + // show this problem. Pulling the correlated predicate [outer(c2#77) >= ..] 
+ // through the Aggregate (or Window) operator could alter the result of + // the Aggregate. + // + // Project [c1#76] + // +- Project [c1#87, c2#88] + // : (Aggregate or Window operator) + // : +- Filter [outer(c2#77) >= c2#88)] + // : +- SubqueryAlias t2, `t2` + // : +- Project [_1#84 AS c1#87, _2#85 AS c2#88] + // : +- LocalRelation [_1#84, _2#85] + // +- SubqueryAlias t1, `t1` + // +- Project [_1#73 AS c1#76, _2#74 AS c2#77] + // +- LocalRelation [_1#73, _2#74] + def failOnNonEqualCorrelatedPredicate(found: Boolean, p: LogicalPlan): Unit = { + if (found) { + // Report a non-supported case as an exception + failAnalysis(s"Correlated column is not allowed in a non-equality predicate:\n$p") + } + } + + var foundNonEqualCorrelatedPred: Boolean = false + + // Simplify the predicates before validating any unsupported correlation patterns + // in the plan. + BooleanSimplification(sub).foreachUp { + // Whitelist operators allowed in a correlated subquery + // There are 4 categories: + // 1. Operators that are allowed anywhere in a correlated subquery, and, + // by definition of the operators, they either do not contain + // any columns or cannot host outer references. + // 2. Operators that are allowed anywhere in a correlated subquery + // so long as they do not host outer references. + // 3. Operators that need special handlings. These operators are + // Filter, Join, Aggregate, and Generate. + // + // Any operators that are not in the above list are allowed + // in a correlated subquery only if they are not on a correlation path. + // In other word, these operators are allowed only under a correlation point. + // + // A correlation path is defined as the sub-tree of all the operators that + // are on the path from the operator hosting the correlated expressions + // up to the operator producing the correlated values. + + // Category 1: + // ResolvedHint, Distinct, LeafNode, Repartition, and SubqueryAlias + case _: ResolvedHint | _: Distinct | _: LeafNode | _: Repartition | _: SubqueryAlias => + + // Category 2: + // These operators can be anywhere in a correlated subquery. + // so long as they do not host outer references in the operators. + case p: Project => + failOnInvalidOuterReference(p) + + case s: Sort => + failOnInvalidOuterReference(s) + + case r: RepartitionByExpression => + failOnInvalidOuterReference(r) + + // Category 3: + // Filter is one of the two operators allowed to host correlated expressions. + // The other operator is Join. Filter can be anywhere in a correlated subquery. + case f: Filter => + val (correlated, _) = splitConjunctivePredicates(f.condition).partition(containsOuter) + + // Find any non-equality correlated predicates + foundNonEqualCorrelatedPred = foundNonEqualCorrelatedPred || correlated.exists { + case _: EqualTo | _: EqualNullSafe => false + case _ => true + } + failOnInvalidOuterReference(f) + + // Aggregate cannot host any correlated expressions + // It can be on a correlation path if the correlation contains + // only equality correlated predicates. + // It cannot be on a correlation path if the correlation has + // non-equality correlated predicates. + case a: Aggregate => + failOnInvalidOuterReference(a) + failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, a) + + // Join can host correlated expressions. + case j @ Join(left, right, joinType, _) => + joinType match { + // Inner join, like Filter, can be anywhere. 
+ case _: InnerLike => + failOnInvalidOuterReference(j) + + // Left outer join's right operand cannot be on a correlation path. + // LeftAnti and ExistenceJoin are special cases of LeftOuter. + // Note that ExistenceJoin cannot be expressed externally in both SQL and DataFrame + // so it should not show up here in Analysis phase. This is just a safety net. + // + // LeftSemi does not allow output from the right operand. + // Any correlated references in the subplan + // of the right operand cannot be pulled up. + case LeftOuter | LeftSemi | LeftAnti | ExistenceJoin(_) => + failOnInvalidOuterReference(j) + failOnOuterReferenceInSubTree(right) + + // Likewise, Right outer join's left operand cannot be on a correlation path. + case RightOuter => + failOnInvalidOuterReference(j) + failOnOuterReferenceInSubTree(left) + + // Any other join types not explicitly listed above, + // including Full outer join, are treated as Category 4. + case _ => + failOnOuterReferenceInSubTree(j) + } + + // Generator with join=true, i.e., expressed with + // LATERAL VIEW [OUTER], similar to inner join, + // allows to have correlation under it + // but must not host any outer references. + // Note: + // Generator with join=false is treated as Category 4. + case g: Generate if g.join => + failOnInvalidOuterReference(g) + + // Category 4: Any other operators not in the above 3 categories + // cannot be on a correlation path, that is they are allowed only + // under a correlation point but they and their descendant operators + // are not allowed to have any correlated expressions. + case p => + failOnOuterReferenceInSubTree(p) + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index c15ee2ab270bc..f3fe58caa6fe2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -144,27 +144,39 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { case cns: CreateNamedStruct => cns.valExprs case expr => Seq(expr) } - - val mismatchedColumns = valExprs.zip(sub.output).flatMap { - case (l, r) if l.dataType != r.dataType => - s"(${l.sql}:${l.dataType.catalogString}, ${r.sql}:${r.dataType.catalogString})" - case _ => None - } - - if (mismatchedColumns.nonEmpty) { + if (valExprs.length != sub.output.length) { TypeCheckResult.TypeCheckFailure( s""" - |The data type of one or more elements in the left hand side of an IN subquery - |is not compatible with the data type of the output of the subquery - |Mismatched columns: - |[${mismatchedColumns.mkString(", ")}] - |Left side: - |[${valExprs.map(_.dataType.catalogString).mkString(", ")}]. - |Right side: - |[${sub.output.map(_.dataType.catalogString).mkString(", ")}]. + |The number of columns in the left hand side of an IN subquery does not match the + |number of columns in the output of subquery. + |#columns in left hand side: ${valExprs.length}. + |#columns in right hand side: ${sub.output.length}. + |Left side columns: + |[${valExprs.map(_.sql).mkString(", ")}]. + |Right side columns: + |[${sub.output.map(_.sql).mkString(", ")}]. 
""".stripMargin) } else { - TypeCheckResult.TypeCheckSuccess + val mismatchedColumns = valExprs.zip(sub.output).flatMap { + case (l, r) if l.dataType != r.dataType => + s"(${l.sql}:${l.dataType.catalogString}, ${r.sql}:${r.dataType.catalogString})" + case _ => None + } + if (mismatchedColumns.nonEmpty) { + TypeCheckResult.TypeCheckFailure( + s""" + |The data type of one or more elements in the left hand side of an IN subquery + |is not compatible with the data type of the output of the subquery + |Mismatched columns: + |[${mismatchedColumns.mkString(", ")}] + |Left side: + |[${valExprs.map(_.dataType.catalogString).mkString(", ")}]. + |Right side: + |[${sub.output.map(_.dataType.catalogString).mkString(", ")}]. + """.stripMargin) + } else { + TypeCheckResult.TypeCheckSuccess + } } case _ => if (list.exists(l => l.dataType != value.dataType)) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 5050318d96358..4ed995e20d7ce 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -111,8 +111,7 @@ class AnalysisErrorSuite extends AnalysisTest { "scalar subquery with 2 columns", testRelation.select( (ScalarSubquery(testRelation.select('a, dateLit.as('b))) + Literal(1)).as('a)), - "The number of columns in the subquery (2)" :: - "does not match the required number of columns (1)":: Nil) + "Scalar subquery must return only one column, but got 2" :: Nil) errorTest( "scalar subquery with no column", diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveSubquerySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveSubquerySuite.scala index 55693121431a2..1bf8d76da04d8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveSubquerySuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveSubquerySuite.scala @@ -35,7 +35,7 @@ class ResolveSubquerySuite extends AnalysisTest { test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") { val expr = Filter(In(a, Seq(ListQuery(Project(Seq(UnresolvedAttribute("a")), t2)))), t1) val m = intercept[AnalysisException] { - SimpleAnalyzer.ResolveSubquery(expr) + SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr)) }.getMessage assert(m.contains( "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses")) diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql new file mode 100644 index 0000000000000..b15f4da81dd93 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql @@ -0,0 +1,47 @@ +-- The test file contains negative test cases +-- of invalid queries where error messages are expected. 
+ +CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES + (1, 2, 3) +AS t1(t1a, t1b, t1c); + +CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES + (1, 0, 1) +AS t2(t2a, t2b, t2c); + +CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES + (3, 1, 2) +AS t3(t3a, t3b, t3c); + +-- TC 01.01 +SELECT + ( SELECT max(t2b), min(t2b) + FROM t2 + WHERE t2.t2b = t1.t1b + GROUP BY t2.t2b + ) +FROM t1; + +-- TC 01.01 +SELECT + ( SELECT max(t2b), min(t2b) + FROM t2 + WHERE t2.t2b > 0 + GROUP BY t2.t2b + ) +FROM t1; + +-- TC 01.03 +SELECT * FROM t1 +WHERE +t1a IN (SELECT t2a, t2b + FROM t2 + WHERE t1a = t2a); + +-- TC 01.04 +SELECT * FROM T1 +WHERE +(t1a, t1b) IN (SELECT t2a + FROM t2 + WHERE t1a = t2a); + diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out new file mode 100644 index 0000000000000..9ea9d3c4c6f40 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out @@ -0,0 +1,106 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 7 + + +-- !query 0 +CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES + (1, 2, 3) +AS t1(t1a, t1b, t1c) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES + (1, 0, 1) +AS t2(t2a, t2b, t2c) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES + (3, 1, 2) +AS t3(t3a, t3b, t3c) +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +SELECT + ( SELECT max(t2b), min(t2b) + FROM t2 + WHERE t2.t2b = t1.t1b + GROUP BY t2.t2b + ) +FROM t1 +-- !query 3 schema +struct<> +-- !query 3 output +org.apache.spark.sql.AnalysisException +Scalar subquery must return only one column, but got 2; + + +-- !query 4 +SELECT + ( SELECT max(t2b), min(t2b) + FROM t2 + WHERE t2.t2b > 0 + GROUP BY t2.t2b + ) +FROM t1 +-- !query 4 schema +struct<> +-- !query 4 output +org.apache.spark.sql.AnalysisException +Scalar subquery must return only one column, but got 2; + + +-- !query 5 +SELECT * FROM t1 +WHERE +t1a IN (SELECT t2a, t2b + FROM t2 + WHERE t1a = t2a) +-- !query 5 schema +struct<> +-- !query 5 output +org.apache.spark.sql.AnalysisException +cannot resolve '(t1.`t1a` IN (listquery(t1.`t1a`)))' due to data type mismatch: +The number of columns in the left hand side of an IN subquery does not match the +number of columns in the output of subquery. +#columns in left hand side: 1. +#columns in right hand side: 2. +Left side columns: +[t1.`t1a`]. +Right side columns: +[t2.`t2a`, t2.`t2b`]. + ; + + +-- !query 6 +SELECT * FROM T1 +WHERE +(t1a, t1b) IN (SELECT t2a + FROM t2 + WHERE t1a = t2a) +-- !query 6 schema +struct<> +-- !query 6 output +org.apache.spark.sql.AnalysisException +cannot resolve '(named_struct('t1a', t1.`t1a`, 't1b', t1.`t1b`) IN (listquery(t1.`t1a`)))' due to data type mismatch: +The number of columns in the left hand side of an IN subquery does not match the +number of columns in the output of subquery. +#columns in left hand side: 2. +#columns in right hand side: 1. +Left side columns: +[t1.`t1a`, t1.`t1b`]. +Right side columns: +[t2.`t2a`]. 
+ ; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 4629a8c0dbe5f..820cff655c4ff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -517,7 +517,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { val msg1 = intercept[AnalysisException] { sql("select a, (select b from l l2 where l2.a = l1.a) sum_b from l l1") } - assert(msg1.getMessage.contains("Correlated scalar subqueries must be Aggregated")) + assert(msg1.getMessage.contains("Correlated scalar subqueries must be aggregated")) val msg2 = intercept[AnalysisException] { sql("select a, (select b from l l2 where l2.a = l1.a group by 1) sum_b from l l1") From 03eb6117affcca21798be25706a39e0d5a2f7288 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Fri, 23 Jun 2017 14:48:33 -0700 Subject: [PATCH 012/779] [SPARK-21164][SQL] Remove isTableSample from Sample and isGenerated from Alias and AttributeReference ## What changes were proposed in this pull request? `isTableSample` and `isGenerated ` were introduced for SQL Generation respectively by https://github.com/apache/spark/pull/11148 and https://github.com/apache/spark/pull/11050 Since SQL Generation is removed, we do not need to keep `isTableSample`. ## How was this patch tested? The existing test cases Author: Xiao Li Closes #18379 from gatorsmile/CleanSample. --- .../sql/catalyst/analysis/Analyzer.scala | 8 ++--- .../expressions/namedExpressions.scala | 34 +++++++------------ .../optimizer/RewriteDistinctAggregates.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 2 +- .../sql/catalyst/planning/patterns.scala | 4 +-- .../spark/sql/catalyst/plans/QueryPlan.scala | 2 +- .../catalyst/plans/logical/LogicalPlan.scala | 2 +- .../plans/logical/basicLogicalOperators.scala | 6 +--- .../analysis/AnalysisErrorSuite.scala | 2 +- .../analysis/UnsupportedOperationsSuite.scala | 2 +- .../optimizer/ColumnPruningSuite.scala | 8 ++--- .../sql/catalyst/parser/PlanParserSuite.scala | 4 +-- .../spark/sql/catalyst/plans/PlanTest.scala | 10 +++--- .../BasicStatsEstimationSuite.scala | 4 +-- .../scala/org/apache/spark/sql/Dataset.scala | 4 +-- 15 files changed, 40 insertions(+), 54 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 193082eb77024..7e5ebfc93286f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -874,7 +874,7 @@ class Analyzer( def newAliases(expressions: Seq[NamedExpression]): Seq[NamedExpression] = { expressions.map { - case a: Alias => Alias(a.child, a.name)(isGenerated = a.isGenerated) + case a: Alias => Alias(a.child, a.name)() case other => other } } @@ -1368,7 +1368,7 @@ class Analyzer( val aggregatedCondition = Aggregate( grouping, - Alias(havingCondition, "havingCondition")(isGenerated = true) :: Nil, + Alias(havingCondition, "havingCondition")() :: Nil, child) val resolvedOperator = execute(aggregatedCondition) def resolvedAggregateFilter = @@ -1424,7 +1424,7 @@ class Analyzer( try { val unresolvedSortOrders = sortOrder.filter(s => !s.resolved || containsAggregate(s)) val aliasedOrdering = - unresolvedSortOrders.map(o => Alias(o.child, "aggOrder")(isGenerated = true)) + unresolvedSortOrders.map(o 
=> Alias(o.child, "aggOrder")()) val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) val resolvedAggregate: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] val resolvedAliasedOrdering: Seq[Alias] = @@ -1935,7 +1935,7 @@ class Analyzer( leafNondeterministic.distinct.map { e => val ne = e match { case n: NamedExpression => n - case _ => Alias(e, "_nondeterministic")(isGenerated = true) + case _ => Alias(e, "_nondeterministic")() } e -> ne } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index c842f85af693c..29c33804f077a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -81,9 +81,6 @@ trait NamedExpression extends Expression { /** Returns the metadata when an expression is a reference to another expression with metadata. */ def metadata: Metadata = Metadata.empty - /** Returns true if the expression is generated by Catalyst */ - def isGenerated: java.lang.Boolean = false - /** Returns a copy of this expression with a new `exprId`. */ def newInstance(): NamedExpression @@ -128,13 +125,11 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn * qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. - * @param isGenerated A flag to indicate if this alias is generated by Catalyst */ case class Alias(child: Expression, name: String)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Option[String] = None, - val explicitMetadata: Option[Metadata] = None, - override val isGenerated: java.lang.Boolean = false) + val explicitMetadata: Option[Metadata] = None) extends UnaryExpression with NamedExpression { // Alias(Generator, xx) need to be transformed into Generate(generator, ...) @@ -159,13 +154,11 @@ case class Alias(child: Expression, name: String)( } def newInstance(): NamedExpression = - Alias(child, name)( - qualifier = qualifier, explicitMetadata = explicitMetadata, isGenerated = isGenerated) + Alias(child, name)(qualifier = qualifier, explicitMetadata = explicitMetadata) override def toAttribute: Attribute = { if (resolved) { - AttributeReference(name, child.dataType, child.nullable, metadata)( - exprId, qualifier, isGenerated) + AttributeReference(name, child.dataType, child.nullable, metadata)(exprId, qualifier) } else { UnresolvedAttribute(name) } @@ -174,7 +167,7 @@ case class Alias(child: Expression, name: String)( override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix" override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: explicitMetadata :: isGenerated :: Nil + exprId :: qualifier :: explicitMetadata :: Nil } override def hashCode(): Int = { @@ -207,7 +200,6 @@ case class Alias(child: Expression, name: String)( * @param qualifier An optional string that can be used to referred to this attribute in a fully * qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. 
- * @param isGenerated A flag to indicate if this reference is generated by Catalyst */ case class AttributeReference( name: String, @@ -215,8 +207,7 @@ case class AttributeReference( nullable: Boolean = true, override val metadata: Metadata = Metadata.empty)( val exprId: ExprId = NamedExpression.newExprId, - val qualifier: Option[String] = None, - override val isGenerated: java.lang.Boolean = false) + val qualifier: Option[String] = None) extends Attribute with Unevaluable { /** @@ -253,8 +244,7 @@ case class AttributeReference( } override def newInstance(): AttributeReference = - AttributeReference(name, dataType, nullable, metadata)( - qualifier = qualifier, isGenerated = isGenerated) + AttributeReference(name, dataType, nullable, metadata)(qualifier = qualifier) /** * Returns a copy of this [[AttributeReference]] with changed nullability. @@ -263,7 +253,7 @@ case class AttributeReference( if (nullable == newNullability) { this } else { - AttributeReference(name, dataType, newNullability, metadata)(exprId, qualifier, isGenerated) + AttributeReference(name, dataType, newNullability, metadata)(exprId, qualifier) } } @@ -271,7 +261,7 @@ case class AttributeReference( if (name == newName) { this } else { - AttributeReference(newName, dataType, nullable, metadata)(exprId, qualifier, isGenerated) + AttributeReference(newName, dataType, nullable, metadata)(exprId, qualifier) } } @@ -282,7 +272,7 @@ case class AttributeReference( if (newQualifier == qualifier) { this } else { - AttributeReference(name, dataType, nullable, metadata)(exprId, newQualifier, isGenerated) + AttributeReference(name, dataType, nullable, metadata)(exprId, newQualifier) } } @@ -290,16 +280,16 @@ case class AttributeReference( if (exprId == newExprId) { this } else { - AttributeReference(name, dataType, nullable, metadata)(newExprId, qualifier, isGenerated) + AttributeReference(name, dataType, nullable, metadata)(newExprId, qualifier) } } override def withMetadata(newMetadata: Metadata): Attribute = { - AttributeReference(name, dataType, nullable, newMetadata)(exprId, qualifier, isGenerated) + AttributeReference(name, dataType, nullable, newMetadata)(exprId, qualifier) } override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: isGenerated :: Nil + exprId :: qualifier :: Nil } /** Used to signal the column used to calculate an eventTime watermark (e.g. a#1-T{delayMs}) */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala index 3b27cd2ffe028..4448ace7105a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala @@ -134,7 +134,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { // Aggregation strategy can handle queries with a single distinct group. if (distinctAggGroups.size > 1) { // Create the attributes for the grouping id and the group by clause. 
- val gid = AttributeReference("gid", IntegerType, nullable = false)(isGenerated = true) + val gid = AttributeReference("gid", IntegerType, nullable = false)() val groupByMap = a.groupingExpressions.collect { case ne: NamedExpression => ne -> ne.toAttribute case e => e -> AttributeReference(e.sql, e.dataType, e.nullable)() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 315c6721b3f65..ef79cbcaa0ce6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -627,7 +627,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging validate(fraction >= 0.0 - eps && fraction <= 1.0 + eps, s"Sampling fraction ($fraction) must be on interval [0, 1]", ctx) - Sample(0.0, fraction, withReplacement = false, (math.random * 1000).toInt, query)(true) + Sample(0.0, fraction, withReplacement = false, (math.random * 1000).toInt, query) } ctx.sampleType.getType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index ef925f92ecc7e..7f370fb731b2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -80,12 +80,12 @@ object PhysicalOperation extends PredicateHelper { expr.transform { case a @ Alias(ref: AttributeReference, name) => aliases.get(ref) - .map(Alias(_, name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)) + .map(Alias(_, name)(a.exprId, a.qualifier)) .getOrElse(a) case a: AttributeReference => aliases.get(a) - .map(Alias(_, a.name)(a.exprId, a.qualifier, isGenerated = a.isGenerated)).getOrElse(a) + .map(Alias(_, a.name)(a.exprId, a.qualifier)).getOrElse(a) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 1f6d05bc8d816..01b3da3f7c482 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -200,7 +200,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT // normalize that for equality testing, by assigning expr id from 0 incrementally. The // alias name doesn't matter and should be erased. 
val normalizedChild = QueryPlan.normalizeExprId(a.child, allAttributes) - Alias(normalizedChild, "")(ExprId(id), a.qualifier, isGenerated = a.isGenerated) + Alias(normalizedChild, "")(ExprId(id), a.qualifier) case ar: AttributeReference if allAttributes.indexOf(ar.exprId) == -1 => // Top level `AttributeReference` may also be used for output like `Alias`, we should diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0c098ac0209e8..0d30aa76049a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -221,7 +221,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstrai nameParts: Seq[String], resolver: Resolver, attribute: Attribute): Option[(Attribute, List[String])] = { - if (!attribute.isGenerated && resolver(attribute.name, nameParts.head)) { + if (resolver(attribute.name, nameParts.head)) { Option((attribute.withName(nameParts.head), nameParts.tail.toList)) } else { None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index d8f89b108e63f..e89caabf252d7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -807,15 +807,13 @@ case class SubqueryAlias( * @param withReplacement Whether to sample with replacement. * @param seed the random seed * @param child the LogicalPlan - * @param isTableSample Is created from TABLESAMPLE in the parser. 
*/ case class Sample( lowerBound: Double, upperBound: Double, withReplacement: Boolean, seed: Long, - child: LogicalPlan)( - val isTableSample: java.lang.Boolean = false) extends UnaryNode { + child: LogicalPlan) extends UnaryNode { val eps = RandomSampler.roundingEpsilon val fraction = upperBound - lowerBound @@ -842,8 +840,6 @@ case class Sample( // Don't propagate column stats, because we don't know the distribution after a sample operation Statistics(sizeInBytes, sampledRowCount, hints = childStats.hints) } - - override protected def otherCopyArgs: Seq[AnyRef] = isTableSample :: Nil } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 4ed995e20d7ce..7311dc3899e53 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -573,7 +573,7 @@ class AnalysisErrorSuite extends AnalysisTest { val plan5 = Filter( Exists( Sample(0.0, 0.5, false, 1L, - Filter(EqualTo(UnresolvedAttribute("a"), b), LocalRelation(b)))().select('b) + Filter(EqualTo(UnresolvedAttribute("a"), b), LocalRelation(b))).select('b) ), LocalRelation(a)) assertAnalysisError(plan5, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index c39e372c272b1..f68d930f60523 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -491,7 +491,7 @@ class UnsupportedOperationsSuite extends SparkFunSuite { // Other unary operations testUnaryOperatorInStreamingPlan( - "sample", Sample(0.1, 1, true, 1L, _)(), expectedMsg = "sampling") + "sample", Sample(0.1, 1, true, 1L, _), expectedMsg = "sampling") testUnaryOperatorInStreamingPlan( "window", Window(Nil, Nil, Nil, _), expectedMsg = "non-time-based windows") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala index 0b419e9631b29..08e58d47e0e25 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -349,14 +349,14 @@ class ColumnPruningSuite extends PlanTest { val testRelation = LocalRelation('a.int, 'b.int, 'c.int) val x = testRelation.subquery('x) - val query1 = Sample(0.0, 0.6, false, 11L, x)().select('a) + val query1 = Sample(0.0, 0.6, false, 11L, x).select('a) val optimized1 = Optimize.execute(query1.analyze) - val expected1 = Sample(0.0, 0.6, false, 11L, x.select('a))() + val expected1 = Sample(0.0, 0.6, false, 11L, x.select('a)) comparePlans(optimized1, expected1.analyze) - val query2 = Sample(0.0, 0.6, false, 11L, x)().select('a as 'aa) + val query2 = Sample(0.0, 0.6, false, 11L, x).select('a as 'aa) val optimized2 = Optimize.execute(query2.analyze) - val expected2 = Sample(0.0, 0.6, false, 11L, x.select('a))().select('a as 'aa) + val expected2 = Sample(0.0, 0.6, false, 11L, x.select('a)).select('a as 'aa) comparePlans(optimized2, expected2.analyze) } diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 0a4ae098d65cc..bf15b85d5b510 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -411,9 +411,9 @@ class PlanParserSuite extends AnalysisTest { assertEqual(s"$sql tablesample(100 rows)", table("t").limit(100).select(star())) assertEqual(s"$sql tablesample(43 percent) as x", - Sample(0, .43d, withReplacement = false, 10L, table("t").as("x"))(true).select(star())) + Sample(0, .43d, withReplacement = false, 10L, table("t").as("x")).select(star())) assertEqual(s"$sql tablesample(bucket 4 out of 10) as x", - Sample(0, .4d, withReplacement = false, 10L, table("t").as("x"))(true).select(star())) + Sample(0, .4d, withReplacement = false, 10L, table("t").as("x")).select(star())) intercept(s"$sql tablesample(bucket 4 out of 10 on x) as x", "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported") intercept(s"$sql tablesample(bucket 11 out of 10) as x", diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 25313af2be184..6883d23d477e4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -63,14 +63,14 @@ abstract class PlanTest extends SparkFunSuite with PredicateHelper { */ protected def normalizePlan(plan: LogicalPlan): LogicalPlan = { plan transform { - case filter @ Filter(condition: Expression, child: LogicalPlan) => - Filter(splitConjunctivePredicates(condition).map(rewriteEqual(_)).sortBy(_.hashCode()) + case Filter(condition: Expression, child: LogicalPlan) => + Filter(splitConjunctivePredicates(condition).map(rewriteEqual).sortBy(_.hashCode()) .reduce(And), child) case sample: Sample => - sample.copy(seed = 0L)(true) - case join @ Join(left, right, joinType, condition) if condition.isDefined => + sample.copy(seed = 0L) + case Join(left, right, joinType, condition) if condition.isDefined => val newCondition = - splitConjunctivePredicates(condition.get).map(rewriteEqual(_)).sortBy(_.hashCode()) + splitConjunctivePredicates(condition.get).map(rewriteEqual).sortBy(_.hashCode()) .reduce(And) Join(left, right, joinType, Some(newCondition)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index e9ed36feec48c..912c5fed63450 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -78,14 +78,14 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { } test("sample estimation") { - val sample = Sample(0.0, 0.5, withReplacement = false, (math.random * 1000).toLong, plan)() + val sample = Sample(0.0, 0.5, withReplacement = false, (math.random * 1000).toLong, plan) checkStats(sample, Statistics(sizeInBytes = 60, rowCount = Some(5))) // Child doesn't have rowCount in stats val childStats = Statistics(sizeInBytes = 120) val childPlan = DummyLogicalPlan(childStats, 
childStats) val sample2 = - Sample(0.0, 0.11, withReplacement = false, (math.random * 1000).toLong, childPlan)() + Sample(0.0, 0.11, withReplacement = false, (math.random * 1000).toLong, childPlan) checkStats(sample2, Statistics(sizeInBytes = 14)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 767dad3e63a6d..6e66e92091ff9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1807,7 +1807,7 @@ class Dataset[T] private[sql]( */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] = { withTypedPlan { - Sample(0.0, fraction, withReplacement, seed, logicalPlan)() + Sample(0.0, fraction, withReplacement, seed, logicalPlan) } } @@ -1863,7 +1863,7 @@ class Dataset[T] private[sql]( val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _) normalizedCumWeights.sliding(2).map { x => new Dataset[T]( - sparkSession, Sample(x(0), x(1), withReplacement = false, seed, plan)(), encoder) + sparkSession, Sample(x(0), x(1), withReplacement = false, seed, plan), encoder) }.toArray } From 7525ce98b4575b1ac4e44cc9b3a5773f03eba19e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 24 Jun 2017 11:39:41 +0800 Subject: [PATCH 013/779] [SPARK-20431][SS][FOLLOWUP] Specify a schema by using a DDL-formatted string in DataStreamReader ## What changes were proposed in this pull request? This pr supported a DDL-formatted string in `DataStreamReader.schema`. This fix could make users easily define a schema without importing the type classes. For example, ```scala scala> spark.readStream.schema("col0 INT, col1 DOUBLE").load("/tmp/abc").printSchema() root |-- col0: integer (nullable = true) |-- col1: double (nullable = true) ``` ## How was this patch tested? Added tests in `DataStreamReaderWriterSuite`. Author: hyukjinkwon Closes #18373 from HyukjinKwon/SPARK-20431. --- python/pyspark/sql/readwriter.py | 2 ++ python/pyspark/sql/streaming.py | 24 ++++++++++++------- .../sql/streaming/DataStreamReader.scala | 12 ++++++++++ .../test/DataStreamReaderWriterSuite.scala | 12 ++++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index aef71f9ca7001..7279173df6e4f 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -98,6 +98,8 @@ def schema(self, schema): :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). + + >>> s = spark.read.schema("col0 INT, col1 DOUBLE") """ from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 58aa2468e006d..5bbd70cf0a789 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -319,16 +319,21 @@ def schema(self, schema): .. note:: Evolving. - :param schema: a :class:`pyspark.sql.types.StructType` object + :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string + (For example ``col0 INT, col1 DOUBLE``). 
>>> s = spark.readStream.schema(sdf_schema) + >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE") """ from pyspark.sql import SparkSession - if not isinstance(schema, StructType): - raise TypeError("schema should be StructType") spark = SparkSession.builder.getOrCreate() - jschema = spark._jsparkSession.parseDataType(schema.json()) - self._jreader = self._jreader.schema(jschema) + if isinstance(schema, StructType): + jschema = spark._jsparkSession.parseDataType(schema.json()) + self._jreader = self._jreader.schema(jschema) + elif isinstance(schema, basestring): + self._jreader = self._jreader.schema(schema) + else: + raise TypeError("schema should be StructType or string") return self @since(2.0) @@ -372,7 +377,8 @@ def load(self, path=None, format=None, schema=None, **options): :param path: optional string for file-system backed data sources. :param format: optional string for format of the data source. Default to 'parquet'. - :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema. + :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema + or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). :param options: all other string options >>> json_sdf = spark.readStream.format("json") \\ @@ -415,7 +421,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, :param path: string represents path to the JSON dataset, or RDD of Strings storing JSON objects. - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema + or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). :param primitivesAsString: infers all primitive values as a string type. If None is set, it uses the default value, ``false``. :param prefersDecimal: infers all floating-point values as a decimal type. If the values @@ -542,7 +549,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non .. note:: Evolving. :param path: string, or list of strings, for input path(s). - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema + or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). :param sep: sets the single character as a separator for each field and value. If None is set, it uses the default value, ``,``. :param encoding: decodes the CSV files by the given encoding type. If None is set, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 7e8e6394b4862..70ddfa8e9b835 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -59,6 +59,18 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo this } + /** + * Specifies the schema by using the input DDL-formatted string. Some data sources (e.g. JSON) can + * infer the input schema automatically from data. By specifying the schema here, the underlying + * data source can skip the schema inference step, and thus speed up data loading. 
+ * + * @since 2.3.0 + */ + def schema(schemaString: String): DataStreamReader = { + this.userSpecifiedSchema = Option(StructType.fromDDL(schemaString)) + this + } + /** * Adds an input option for the underlying data source. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index b5f1e28d7396a..3de0ae67a3892 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -663,4 +663,16 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { } assert(fs.exists(checkpointDir)) } + + test("SPARK-20431: Specify a schema by using a DDL-formatted string") { + spark.readStream + .format("org.apache.spark.sql.streaming.test") + .schema("aa INT") + .load() + + assert(LastOptions.schema.isDefined) + assert(LastOptions.schema.get === StructType(StructField("aa", IntegerType) :: Nil)) + + LastOptions.clear() + } } From b837bf9ae97cf7ee7558c10a5a34636e69367a05 Mon Sep 17 00:00:00 2001 From: Gabor Feher Date: Fri, 23 Jun 2017 21:53:38 -0700 Subject: [PATCH 014/779] [SPARK-20555][SQL] Fix mapping of Oracle DECIMAL types to Spark types in read path ## What changes were proposed in this pull request? This PR is to revert some code changes in the read path of https://github.com/apache/spark/pull/14377. The original fix is https://github.com/apache/spark/pull/17830 When merging this PR, please give the credit to gaborfeher ## How was this patch tested? Added a test case to OracleIntegrationSuite.scala Author: Gabor Feher Author: gatorsmile Closes #18408 from gatorsmile/OracleType. 
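For illustration, this is roughly what the read path looks like after the revert. The JDBC URL, credentials and table below are placeholders (the table mirrors the `numerics` table used in the new test), and a reachable Oracle instance with its JDBC driver on the classpath is assumed:

```scala
// Hypothetical connection details; only the resulting schema is the point here.
val jdbcUrl = "jdbc:oracle:thin:@//dbhost:1521/XE"
val numerics = spark.read
  .format("jdbc")
  .option("url", jdbcUrl)
  .option("dbtable", "numerics")   // e.g. (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))
  .option("user", "username")
  .option("password", "password")
  .load()

// With the special-case narrowing removed from the dialect, these columns come
// back as DecimalType instead of Boolean/Integer/Long/Float, so a value such as
// 9999999999 from DECIMAL(10), which overflows Int, is returned intact.
numerics.printSchema()
```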
--- .../sql/jdbc/OracleIntegrationSuite.scala | 65 +++++++++++++------ .../apache/spark/sql/jdbc/OracleDialect.scala | 4 -- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index f7b1ec34ced76..b2f096964427e 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.jdbc import java.sql.{Connection, Date, Timestamp} import java.util.Properties +import java.math.BigDecimal import org.apache.spark.sql.Row import org.apache.spark.sql.test.SharedSQLContext @@ -93,8 +94,31 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo |USING org.apache.spark.sql.jdbc |OPTIONS (url '$jdbcUrl', dbTable 'datetime1', oracle.jdbc.mapDateToTimestamp 'false') """.stripMargin.replaceAll("\n", " ")) + + + conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))").executeUpdate(); + conn.prepareStatement( + "INSERT INTO numerics VALUES (4, 1.23, 9999999999)").executeUpdate(); + conn.commit(); } + + test("SPARK-16625 : Importing Oracle numeric types") { + val df = sqlContext.read.jdbc(jdbcUrl, "numerics", new Properties); + val rows = df.collect() + assert(rows.size == 1) + val row = rows(0) + // The main point of the below assertions is not to make sure that these Oracle types are + // mapped to decimal types, but to make sure that the returned values are correct. + // A value > 1 from DECIMAL(1) is correct: + assert(row.getDecimal(0).compareTo(BigDecimal.valueOf(4)) == 0) + // A value with fractions from DECIMAL(3, 2) is correct: + assert(row.getDecimal(1).compareTo(BigDecimal.valueOf(1.23)) == 0) + // A value > Int.MaxValue from DECIMAL(10) is correct: + assert(row.getDecimal(2).compareTo(BigDecimal.valueOf(9999999999l)) == 0) + } + + test("SPARK-12941: String datatypes to be mapped to Varchar in Oracle") { // create a sample dataframe with string type val df1 = sparkContext.parallelize(Seq(("foo"))).toDF("x") @@ -154,27 +178,28 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo val dfRead = spark.read.jdbc(jdbcUrl, tableName, props) val rows = dfRead.collect() // verify the data type is inserted - val types = rows(0).toSeq.map(x => x.getClass.toString) - assert(types(0).equals("class java.lang.Boolean")) - assert(types(1).equals("class java.lang.Integer")) - assert(types(2).equals("class java.lang.Long")) - assert(types(3).equals("class java.lang.Float")) - assert(types(4).equals("class java.lang.Float")) - assert(types(5).equals("class java.lang.Integer")) - assert(types(6).equals("class java.lang.Integer")) - assert(types(7).equals("class java.lang.String")) - assert(types(8).equals("class [B")) - assert(types(9).equals("class java.sql.Date")) - assert(types(10).equals("class java.sql.Timestamp")) + val types = dfRead.schema.map(field => field.dataType) + assert(types(0).equals(DecimalType(1, 0))) + assert(types(1).equals(DecimalType(10, 0))) + assert(types(2).equals(DecimalType(19, 0))) + assert(types(3).equals(DecimalType(19, 4))) + assert(types(4).equals(DecimalType(19, 4))) + assert(types(5).equals(DecimalType(3, 0))) + assert(types(6).equals(DecimalType(5, 0))) + 
assert(types(7).equals(StringType)) + assert(types(8).equals(BinaryType)) + assert(types(9).equals(DateType)) + assert(types(10).equals(TimestampType)) + // verify the value is the inserted correct or not val values = rows(0) - assert(values.getBoolean(0).equals(booleanVal)) - assert(values.getInt(1).equals(integerVal)) - assert(values.getLong(2).equals(longVal)) - assert(values.getFloat(3).equals(floatVal)) - assert(values.getFloat(4).equals(doubleVal.toFloat)) - assert(values.getInt(5).equals(byteVal.toInt)) - assert(values.getInt(6).equals(shortVal.toInt)) + assert(values.getDecimal(0).compareTo(BigDecimal.valueOf(1)) == 0) + assert(values.getDecimal(1).compareTo(BigDecimal.valueOf(integerVal)) == 0) + assert(values.getDecimal(2).compareTo(BigDecimal.valueOf(longVal)) == 0) + assert(values.getDecimal(3).compareTo(BigDecimal.valueOf(floatVal)) == 0) + assert(values.getDecimal(4).compareTo(BigDecimal.valueOf(doubleVal)) == 0) + assert(values.getDecimal(5).compareTo(BigDecimal.valueOf(byteVal)) == 0) + assert(values.getDecimal(6).compareTo(BigDecimal.valueOf(shortVal)) == 0) assert(values.getString(7).equals(stringVal)) assert(values.getAs[Array[Byte]](8).mkString.equals("678")) assert(values.getDate(9).equals(dateVal)) @@ -183,7 +208,7 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo test("SPARK-19318: connection property keys should be case-sensitive") { def checkRow(row: Row): Unit = { - assert(row.getInt(0) == 1) + assert(row.getDecimal(0).equals(BigDecimal.valueOf(1))) assert(row.getDate(1).equals(Date.valueOf("1991-11-09"))) assert(row.getTimestamp(2).equals(Timestamp.valueOf("1996-01-01 01:23:45"))) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index f541996b651e9..20e634c06b610 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -43,10 +43,6 @@ private case object OracleDialect extends JdbcDialect { // Not sure if there is a more robust way to identify the field as a float (or other // numeric types that do not specify a scale. case _ if scale == -127L => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) - case 1 => Option(BooleanType) - case 3 | 5 | 10 => Option(IntegerType) - case 19 if scale == 0L => Option(LongType) - case 19 if scale == 4L => Option(FloatType) case _ => None } } else { From bfd73a7c48b87456d1b84d826e04eca938a1be64 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Sat, 24 Jun 2017 13:23:43 +0800 Subject: [PATCH 015/779] [SPARK-21159][CORE] Don't try to connect to launcher in standalone cluster mode. Monitoring for standalone cluster mode is not implemented (see SPARK-11033), but the same scheduler implementation is used, and if it tries to connect to the launcher it will fail. So fix the scheduler so it only tries that in client mode; cluster mode applications will be correctly launched and will work, but monitoring through the launcher handle will not be available. Tested by running a cluster mode app with "SparkLauncher.startApplication". Author: Marcelo Vanzin Closes #18397 from vanzin/SPARK-21159. 
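The test scenario described above can be sketched as follows; the application jar, main class and master URL are placeholders. After this change the application launches and runs, but state updates through the returned handle are not available in standalone cluster mode.

```scala
// Sketch of launching a standalone cluster-mode app via the launcher library.
// All resource/class/master values below are placeholders.
import org.apache.spark.launcher.SparkLauncher

val handle = new SparkLauncher()
  .setAppResource("/path/to/app.jar")
  .setMainClass("com.example.MyApp")
  .setMaster("spark://master-host:7077")
  .setDeployMode("cluster")
  .startApplication()

// Monitoring is not implemented for standalone cluster mode (SPARK-11033),
// so the handle will not report further state transitions.
println(handle.getState())
```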
--- .../scheduler/cluster/StandaloneSchedulerBackend.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index fd8e64454bf70..a4e2a74341283 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -58,7 +58,13 @@ private[spark] class StandaloneSchedulerBackend( override def start() { super.start() - launcherBackend.connect() + + // SPARK-21159. The scheduler backend should only try to connect to the launcher when in client + // mode. In cluster mode, the code that submits the application to the Master needs to connect + // to the launcher instead. + if (sc.deployMode == "client") { + launcherBackend.connect() + } // The endpoint for executors to talk to us val driverUrl = RpcEndpointAddress( From 7c7bc8fc0ff85fe70968b47433bb7757326a6b12 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 24 Jun 2017 10:14:31 +0100 Subject: [PATCH 016/779] [SPARK-21189][INFRA] Handle unknown error codes in Jenkins rather then leaving incomplete comment in PRs ## What changes were proposed in this pull request? Recently, Jenkins tests were unstable due to unknown reasons as below: ``` /home/jenkins/workspace/SparkPullRequestBuilder/dev/lint-r ; process was terminated by signal 9 test_result_code, test_result_note = run_tests(tests_timeout) File "./dev/run-tests-jenkins.py", line 140, in run_tests test_result_note = ' * This patch **fails %s**.' % failure_note_by_errcode[test_result_code] KeyError: -9 ``` ``` Traceback (most recent call last): File "./dev/run-tests-jenkins.py", line 226, in main() File "./dev/run-tests-jenkins.py", line 213, in main test_result_code, test_result_note = run_tests(tests_timeout) File "./dev/run-tests-jenkins.py", line 140, in run_tests test_result_note = ' * This patch **fails %s**.' % failure_note_by_errcode[test_result_code] KeyError: -10 ``` This exception looks causing failing to update the comments in the PR. For example: ![2017-06-23 4 19 41](https://user-images.githubusercontent.com/6477701/27470626-d035ecd8-582f-11e7-883e-0ae6941659b7.png) ![2017-06-23 4 19 50](https://user-images.githubusercontent.com/6477701/27470629-d11ba782-582f-11e7-97e0-64d28cbc19aa.png) these comment just remain. This always requires, for both reviewers and the author, a overhead to click and check the logs, which I believe are not really useful. This PR proposes to leave the code in the PR comment messages and let update the comments. ## How was this patch tested? Jenkins tests below, I manually gave the error code to test this. Author: hyukjinkwon Closes #18399 from HyukjinKwon/jenkins-print-errors. --- dev/run-tests-jenkins.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 53061bc947e5f..914eb93622d51 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -137,7 +137,9 @@ def run_tests(tests_timeout): if test_result_code == 0: test_result_note = ' * This patch passes all tests.' else: - test_result_note = ' * This patch **fails %s**.' % failure_note_by_errcode[test_result_code] + note = failure_note_by_errcode.get( + test_result_code, "due to an unknown error code, %s" % test_result_code) + test_result_note = ' * This patch **fails %s**.' 
% note return [test_result_code, test_result_note] From 2e1586f60a77ea0adb6f3f68ba74323f0c242199 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 24 Jun 2017 22:35:59 +0800 Subject: [PATCH 017/779] [SPARK-21203][SQL] Fix wrong results of insertion of Array of Struct ### What changes were proposed in this pull request? ```SQL CREATE TABLE `tab1` (`custom_fields` ARRAY>) USING parquet INSERT INTO `tab1` SELECT ARRAY(named_struct('id', 1, 'value', 'a'), named_struct('id', 2, 'value', 'b')) SELECT custom_fields.id, custom_fields.value FROM tab1 ``` The above query always return the last struct of the array, because the rule `SimplifyCasts` incorrectly rewrites the query. The underlying cause is we always use the same `GenericInternalRow` object when doing the cast. ### How was this patch tested? Author: gatorsmile Closes #18412 from gatorsmile/castStruct. --- .../spark/sql/catalyst/expressions/Cast.scala | 4 ++-- .../spark/sql/sources/InsertSuite.scala | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index a53ef426f79b5..43df19ba009a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -482,15 +482,15 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case (fromField, toField) => cast(fromField.dataType, toField.dataType) } // TODO: Could be faster? - val newRow = new GenericInternalRow(from.fields.length) buildCast[InternalRow](_, row => { + val newRow = new GenericInternalRow(from.fields.length) var i = 0 while (i < row.numFields) { newRow.update(i, if (row.isNullAt(i)) null else castFuncs(i)(row.get(i, from.apply(i).dataType))) i += 1 } - newRow.copy() + newRow }) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 2eae66dda88de..41abff2a5da25 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -345,4 +345,25 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { ) } } + + test("SPARK-21203 wrong results of insertion of Array of Struct") { + val tabName = "tab1" + withTable(tabName) { + spark.sql( + """ + |CREATE TABLE `tab1` + |(`custom_fields` ARRAY>) + |USING parquet + """.stripMargin) + spark.sql( + """ + |INSERT INTO `tab1` + |SELECT ARRAY(named_struct('id', 1, 'value', 'a'), named_struct('id', 2, 'value', 'b')) + """.stripMargin) + + checkAnswer( + spark.sql("SELECT custom_fields.id, custom_fields.value FROM tab1"), + Row(Array(1, 2), Array("a", "b"))) + } + } } From b449a1d6aa322a50cf221cd7a2ae85a91d6c7e9f Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Sat, 24 Jun 2017 22:49:35 -0700 Subject: [PATCH 018/779] [SPARK-21079][SQL] Calculate total size of a partition table as a sum of individual partitions ## What changes were proposed in this pull request? Storage URI of a partitioned table may or may not point to a directory under which individual partitions are stored. In fact, individual partitions may be located in totally unrelated directories. 
Before this change, ANALYZE TABLE table COMPUTE STATISTICS command calculated total size of a table by adding up sizes of files found under table's storage URI. This calculation could produce 0 if partitions are stored elsewhere. This change uses storage URIs of individual partitions to calculate the sizes of all partitions of a table and adds these up to produce the total size of a table. CC: wzhfy ## How was this patch tested? Added unit test. Ran ANALYZE TABLE xxx COMPUTE STATISTICS on a partitioned Hive table and verified that sizeInBytes is calculated correctly. Before this change, the size would be zero. Author: Masha Basmanova Closes #18309 from mbasmanova/mbasmanova-analyze-part-table. --- .../command/AnalyzeTableCommand.scala | 29 ++++++-- .../spark/sql/hive/StatisticsSuite.scala | 72 +++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index 3c59b982c2dca..06e588f56f1e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.command +import java.net.URI + import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} @@ -81,6 +83,21 @@ case class AnalyzeTableCommand( object AnalyzeTableCommand extends Logging { def calculateTotalSize(sessionState: SessionState, catalogTable: CatalogTable): Long = { + if (catalogTable.partitionColumnNames.isEmpty) { + calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri) + } else { + // Calculate table size as a sum of the visible partitions. See SPARK-21079 + val partitions = sessionState.catalog.listPartitions(catalogTable.identifier) + partitions.map(p => + calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri) + ).sum + } + } + + private def calculateLocationSize( + sessionState: SessionState, + tableId: TableIdentifier, + locationUri: Option[URI]): Long = { // This method is mainly based on // org.apache.hadoop.hive.ql.stats.StatsUtils.getFileSizeForTable(HiveConf, Table) // in Hive 0.13 (except that we do not use fs.getContentSummary). @@ -91,13 +108,13 @@ object AnalyzeTableCommand extends Logging { // countFileSize to count the table size. 
val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") - def calculateTableSize(fs: FileSystem, path: Path): Long = { + def calculateLocationSize(fs: FileSystem, path: Path): Long = { val fileStatus = fs.getFileStatus(path) val size = if (fileStatus.isDirectory) { fs.listStatus(path) .map { status => if (!status.getPath.getName.startsWith(stagingDir)) { - calculateTableSize(fs, status.getPath) + calculateLocationSize(fs, status.getPath) } else { 0L } @@ -109,16 +126,16 @@ object AnalyzeTableCommand extends Logging { size } - catalogTable.storage.locationUri.map { p => + locationUri.map { p => val path = new Path(p) try { val fs = path.getFileSystem(sessionState.newHadoopConf()) - calculateTableSize(fs, path) + calculateLocationSize(fs, path) } catch { case NonFatal(e) => logWarning( - s"Failed to get the size of table ${catalogTable.identifier.table} in the " + - s"database ${catalogTable.identifier.database} because of ${e.toString}", e) + s"Failed to get the size of table ${tableId.table} in the " + + s"database ${tableId.database} because of ${e.toString}", e) 0L } }.getOrElse(0L) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 279db9a397258..0ee18bbe9befe 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton { @@ -128,6 +129,77 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false) } + test("SPARK-21079 - analyze table with location different than that of individual partitions") { + def queryTotalSize(tableName: String): BigInt = + spark.table(tableName).queryExecution.analyzed.stats(conf).sizeInBytes + + val tableName = "analyzeTable_part" + withTable(tableName) { + withTempPath { path => + sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING)") + + val partitionDates = List("2010-01-01", "2010-01-02", "2010-01-03") + partitionDates.foreach { ds => + sql(s"INSERT INTO TABLE $tableName PARTITION (ds='$ds') SELECT * FROM src") + } + + sql(s"ALTER TABLE $tableName SET LOCATION '$path'") + + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") + + assert(queryTotalSize(tableName) === BigInt(17436)) + } + } + } + + test("SPARK-21079 - analyze partitioned table with only a subset of partitions visible") { + def queryTotalSize(tableName: String): BigInt = + spark.table(tableName).queryExecution.analyzed.stats(conf).sizeInBytes + + val sourceTableName = "analyzeTable_part" + val tableName = "analyzeTable_part_vis" + withTable(sourceTableName, tableName) { + withTempPath { path => + // Create a table with 3 partitions all located under a single top-level directory 'path' + sql( + s""" + |CREATE TABLE $sourceTableName (key STRING, value STRING) + |PARTITIONED BY (ds STRING) + |LOCATION '$path' + """.stripMargin) + + val partitionDates = List("2010-01-01", "2010-01-02", "2010-01-03") + partitionDates.foreach { ds => + sql( + s""" + |INSERT INTO TABLE $sourceTableName PARTITION (ds='$ds') + 
|SELECT * FROM src + """.stripMargin) + } + + // Create another table referring to the same location + sql( + s""" + |CREATE TABLE $tableName (key STRING, value STRING) + |PARTITIONED BY (ds STRING) + |LOCATION '$path' + """.stripMargin) + + // Register only one of the partitions found on disk + val ds = partitionDates.head + sql(s"ALTER TABLE $tableName ADD PARTITION (ds='$ds')").collect() + + // Analyze original table - expect 3 partitions + sql(s"ANALYZE TABLE $sourceTableName COMPUTE STATISTICS noscan") + assert(queryTotalSize(sourceTableName) === BigInt(3 * 5812)) + + // Analyze partial-copy table - expect only 1 partition + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") + assert(queryTotalSize(tableName) === BigInt(5812)) + } + } + } + test("analyzing views is not supported") { def assertAnalyzeUnsupported(analyzeCommand: String): Unit = { val err = intercept[AnalysisException] { From 884347e1f79e4e7c157834881e79447d7ee58f88 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Sun, 25 Jun 2017 15:06:29 +0100 Subject: [PATCH 019/779] [HOT FIX] fix stats functions in the recent patch ## What changes were proposed in this pull request? Builds failed due to the recent [merge](https://github.com/apache/spark/commit/b449a1d6aa322a50cf221cd7a2ae85a91d6c7e9f). This is because [PR#18309](https://github.com/apache/spark/pull/18309) needed update after [this patch](https://github.com/apache/spark/commit/b803b66a8133f705463039325ee71ee6827ce1a7) was merged. ## How was this patch tested? N/A Author: Zhenhua Wang Closes #18415 from wzhfy/hotfixStats. --- .../scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 0ee18bbe9befe..64deb3818d5d1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton { @@ -131,7 +130,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("SPARK-21079 - analyze table with location different than that of individual partitions") { def queryTotalSize(tableName: String): BigInt = - spark.table(tableName).queryExecution.analyzed.stats(conf).sizeInBytes + spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes val tableName = "analyzeTable_part" withTable(tableName) { @@ -154,7 +153,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("SPARK-21079 - analyze partitioned table with only a subset of partitions visible") { def queryTotalSize(tableName: String): BigInt = - spark.table(tableName).queryExecution.analyzed.stats(conf).sizeInBytes + spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes val sourceTableName = "analyzeTable_part" val tableName = "analyzeTable_part_vis" From 6b3d02285ee0debc73cbcab01b10398a498fbeb8 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 25 Jun 2017 11:05:57 -0700 Subject: [PATCH 020/779] [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak ## What changes were 
proposed in this pull request? `mcfork` in R looks opening a pipe ahead but the existing logic does not properly close it when it is executed hot. This leads to the failure of more forking due to the limit for number of files open. This hot execution looks particularly for `gapply`/`gapplyCollect`. For unknown reason, this happens more easily in CentOS and could be reproduced in Mac too. All the details are described in https://issues.apache.org/jira/browse/SPARK-21093 This PR proposes simply to terminate R's worker processes in the parent of R's daemon to prevent a leak. ## How was this patch tested? I ran the codes below on both CentOS and Mac with that configuration disabled/enabled. ```r df <- createDataFrame(list(list(1L, 1, "1", 0.1)), c("a", "b", "c", "d")) collect(gapply(df, "a", function(key, x) { x }, schema(df))) collect(gapply(df, "a", function(key, x) { x }, schema(df))) ... # 30 times ``` Also, now it passes R tests on CentOS as below: ``` SparkSQL functions: Spark package found in SPARK_HOME: .../spark .............................................................................................................................................................. .............................................................................................................................................................. .............................................................................................................................................................. .............................................................................................................................................................. .............................................................................................................................................................. .................................................................................................................................... ``` Author: hyukjinkwon Closes #18320 from HyukjinKwon/SPARK-21093. --- R/pkg/inst/worker/daemon.R | 59 +++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 3a318b71ea06d..6e385b2a27622 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,8 +30,55 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) +# Waits indefinitely for a socket connecion by default. +selectTimeout <- NULL + +# Exit code that children send to the parent to indicate they exited. +exitCode <- 1 + while (TRUE) { - ready <- socketSelect(list(inputCon)) + ready <- socketSelect(list(inputCon), timeout = selectTimeout) + + # Note that the children should be terminated in the parent. If each child terminates + # itself, it appears that the resource is not released properly, that causes an unexpected + # termination of this daemon due to, for example, running out of file descriptors + # (see SPARK-21093). Therefore, the current implementation tries to retrieve children + # that are exited (but not terminated) and then sends a kill signal to terminate them properly + # in the parent. + # + # There are two paths that it attempts to send a signal to terminate the children in the parent. + # + # 1. Every second if any socket connection is not available and if there are child workers + # running. + # 2. Right after a socket connection is available. 
+ # + # In other words, the parent attempts to send the signal to the children every second if + # any worker is running or right before launching other worker children from the following + # new socket connection. + + # Only the process IDs of children sent data to the parent are returned below. The children + # send a custom exit code to the parent after being exited and the parent tries + # to terminate them only if they sent the exit code. + children <- parallel:::selectChildren(timeout = 0) + + if (is.integer(children)) { + lapply(children, function(child) { + # This data should be raw bytes if any data was sent from this child. + # Otherwise, this returns the PID. + data <- parallel:::readChild(child) + if (is.raw(data)) { + # This checks if the data from this child is the exit code that indicates an exited child. + if (unserialize(data) == exitCode) { + # If so, we terminate this child. + tools::pskill(child, tools::SIGUSR1) + } + } + }) + } else if (is.null(children)) { + # If it is NULL, there are no children. Waits indefinitely for a socket connecion. + selectTimeout <- NULL + } + if (ready) { port <- SparkR:::readInt(inputCon) # There is a small chance that it could be interrupted by signal, retry one time @@ -44,12 +91,16 @@ while (TRUE) { } p <- parallel:::mcfork() if (inherits(p, "masterProcess")) { + # Reach here because this is a child process. close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) try(source(script)) - # Set SIGUSR1 so that child can exit - tools::pskill(Sys.getpid(), tools::SIGUSR1) - parallel:::mcexit(0L) + # Note that this mcexit does not fully terminate this child. So, this writes back + # a custom exit code so that the parent can read and terminate this child. + parallel:::mcexit(0L, send = exitCode) + } else { + # Forking succeeded and we need to check if they finished their jobs every second. + selectTimeout <- 1 } } } From 5282bae0408dec8aa0cefafd7673dd34d232ead9 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 26 Jun 2017 01:26:32 -0700 Subject: [PATCH 021/779] [SPARK-21153] Use project instead of expand in tumbling windows ## What changes were proposed in this pull request? Time windowing in Spark currently performs an Expand + Filter, because there is no way to guarantee the amount of windows a timestamp will fall in, in the general case. However, for tumbling windows, a record is guaranteed to fall into a single bucket. In this case, doubling the number of records with Expand is wasteful, and can be improved by using a simple Projection instead. Benchmarks show that we get an order of magnitude performance improvement after this patch. ## How was this patch tested? Existing unit tests. Benchmarked using the following code: ```scala import org.apache.spark.sql.functions._ spark.time { spark.range(numRecords) .select(from_unixtime((current_timestamp().cast("long") * 1000 + 'id / 1000) / 1000) as 'time) .select(window('time, "10 seconds")) .count() } ``` Setup: - 1 c3.2xlarge worker (8 cores) ![image](https://user-images.githubusercontent.com/5243515/27348748-ed991b84-55a9-11e7-8f8b-6e7abc524417.png) 1 B rows ran in 287 seconds after this optimization. I didn't wait for it to finish without the optimization. Shows about 5x improvement for large number of records. Author: Burak Yavuz Closes #18364 from brkyvz/opt-tumble. 
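A quick way to see the effect, mirroring the new unit tests below (a sketch that assumes a `spark` session is in scope, e.g. in spark-shell):

```scala
// Tumbling windows (window duration == slide duration) should no longer put an
// Expand node into the optimized plan; sliding windows still need one.
import org.apache.spark.sql.catalyst.plans.logical.Expand
import org.apache.spark.sql.functions._
import spark.implicits._

val df = Seq(("2016-03-27 19:39:34", 1)).toDF("time", "value")

val tumbling = df.groupBy(window($"time", "10 seconds")).count()
val sliding = df.groupBy(window($"time", "10 seconds", "3 seconds")).count()

assert(tumbling.queryExecution.optimizedPlan.find(_.isInstanceOf[Expand]).isEmpty)
assert(sliding.queryExecution.optimizedPlan.find(_.isInstanceOf[Expand]).nonEmpty)
```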
--- .../sql/catalyst/analysis/Analyzer.scala | 72 +++++++++++++------ .../sql/catalyst/expressions/TimeWindow.scala | 12 ++-- .../sql/DataFrameTimeWindowingSuite.scala | 49 +++++++++---- 3 files changed, 94 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7e5ebfc93286f..434b6ffee37fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2301,6 +2301,7 @@ object EliminateEventTimeWatermark extends Rule[LogicalPlan] { object TimeWindowing extends Rule[LogicalPlan] { import org.apache.spark.sql.catalyst.dsl.expressions._ + private final val WINDOW_COL_NAME = "window" private final val WINDOW_START = "start" private final val WINDOW_END = "end" @@ -2336,49 +2337,76 @@ object TimeWindowing extends Rule[LogicalPlan] { case p: LogicalPlan if p.children.size == 1 => val child = p.children.head val windowExpressions = - p.expressions.flatMap(_.collect { case t: TimeWindow => t }).distinct.toList // Not correct. + p.expressions.flatMap(_.collect { case t: TimeWindow => t }).toSet // Only support a single window expression for now if (windowExpressions.size == 1 && windowExpressions.head.timeColumn.resolved && windowExpressions.head.checkInputDataTypes().isSuccess) { + val window = windowExpressions.head val metadata = window.timeColumn match { case a: Attribute => a.metadata case _ => Metadata.empty } - val windowAttr = - AttributeReference("window", window.dataType, metadata = metadata)() - - val maxNumOverlapping = math.ceil(window.windowDuration * 1.0 / window.slideDuration).toInt - val windows = Seq.tabulate(maxNumOverlapping + 1) { i => - val windowId = Ceil((PreciseTimestamp(window.timeColumn) - window.startTime) / - window.slideDuration) - val windowStart = (windowId + i - maxNumOverlapping) * - window.slideDuration + window.startTime + + def getWindow(i: Int, overlappingWindows: Int): Expression = { + val division = (PreciseTimestampConversion( + window.timeColumn, TimestampType, LongType) - window.startTime) / window.slideDuration + val ceil = Ceil(division) + // if the division is equal to the ceiling, our record is the start of a window + val windowId = CaseWhen(Seq((ceil === division, ceil + 1)), Some(ceil)) + val windowStart = (windowId + i - overlappingWindows) * + window.slideDuration + window.startTime val windowEnd = windowStart + window.windowDuration CreateNamedStruct( - Literal(WINDOW_START) :: windowStart :: - Literal(WINDOW_END) :: windowEnd :: Nil) + Literal(WINDOW_START) :: + PreciseTimestampConversion(windowStart, LongType, TimestampType) :: + Literal(WINDOW_END) :: + PreciseTimestampConversion(windowEnd, LongType, TimestampType) :: + Nil) } - val projections = windows.map(_ +: p.children.head.output) + val windowAttr = AttributeReference( + WINDOW_COL_NAME, window.dataType, metadata = metadata)() + + if (window.windowDuration == window.slideDuration) { + val windowStruct = Alias(getWindow(0, 1), WINDOW_COL_NAME)( + exprId = windowAttr.exprId) + + val replacedPlan = p transformExpressions { + case t: TimeWindow => windowAttr + } + + // For backwards compatibility we add a filter to filter out nulls + val filterExpr = IsNotNull(window.timeColumn) - val filterExpr = - window.timeColumn >= windowAttr.getField(WINDOW_START) && - window.timeColumn < windowAttr.getField(WINDOW_END) + 
replacedPlan.withNewChildren( + Filter(filterExpr, + Project(windowStruct +: child.output, child)) :: Nil) + } else { + val overlappingWindows = + math.ceil(window.windowDuration * 1.0 / window.slideDuration).toInt + val windows = + Seq.tabulate(overlappingWindows)(i => getWindow(i, overlappingWindows)) + + val projections = windows.map(_ +: child.output) + + val filterExpr = + window.timeColumn >= windowAttr.getField(WINDOW_START) && + window.timeColumn < windowAttr.getField(WINDOW_END) - val expandedPlan = - Filter(filterExpr, + val substitutedPlan = Filter(filterExpr, Expand(projections, windowAttr +: child.output, child)) - val substitutedPlan = p transformExpressions { - case t: TimeWindow => windowAttr - } + val renamedPlan = p transformExpressions { + case t: TimeWindow => windowAttr + } - substitutedPlan.withNewChildren(expandedPlan :: Nil) + renamedPlan.withNewChildren(substitutedPlan :: Nil) + } } else if (windowExpressions.size > 1) { p.failAnalysis("Multiple time window expressions would result in a cartesian product " + "of rows, therefore they are currently not supported.") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index 7ff61ee479452..9a9f579b37f58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -152,12 +152,15 @@ object TimeWindow { } /** - * Expression used internally to convert the TimestampType to Long without losing + * Expression used internally to convert the TimestampType to Long and back without losing * precision, i.e. in microseconds. Used in time windowing. 
*/ -case class PreciseTimestamp(child: Expression) extends UnaryExpression with ExpectsInputTypes { - override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType) - override def dataType: DataType = LongType +case class PreciseTimestampConversion( + child: Expression, + fromType: DataType, + toType: DataType) extends UnaryExpression with ExpectsInputTypes { + override def inputTypes: Seq[AbstractDataType] = Seq(fromType) + override def dataType: DataType = toType override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val eval = child.genCode(ctx) ev.copy(code = eval.code + @@ -165,4 +168,5 @@ case class PreciseTimestamp(child: Expression) extends UnaryExpression with Expe |${ctx.javaType(dataType)} ${ev.value} = ${eval.value}; """.stripMargin) } + override def nullSafeEval(input: Any): Any = input } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala index 22d5c47a6fb51..6fe356877c268 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala @@ -17,10 +17,9 @@ package org.apache.spark.sql -import java.util.TimeZone - import org.scalatest.BeforeAndAfterEach +import org.apache.spark.sql.catalyst.plans.logical.Expand import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.StringType @@ -29,11 +28,27 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B import testImplicits._ + test("simple tumbling window with record at window start") { + val df = Seq( + ("2016-03-27 19:39:30", 1, "a")).toDF("time", "value", "id") + + checkAnswer( + df.groupBy(window($"time", "10 seconds")) + .agg(count("*").as("counts")) + .orderBy($"window.start".asc) + .select($"window.start".cast("string"), $"window.end".cast("string"), $"counts"), + Seq( + Row("2016-03-27 19:39:30", "2016-03-27 19:39:40", 1) + ) + ) + } + test("tumbling window groupBy statement") { val df = Seq( ("2016-03-27 19:39:34", 1, "a"), ("2016-03-27 19:39:56", 2, "a"), ("2016-03-27 19:39:27", 4, "b")).toDF("time", "value", "id") + checkAnswer( df.groupBy(window($"time", "10 seconds")) .agg(count("*").as("counts")) @@ -59,14 +74,18 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B test("tumbling window with multi-column projection") { val df = Seq( - ("2016-03-27 19:39:34", 1, "a"), - ("2016-03-27 19:39:56", 2, "a"), - ("2016-03-27 19:39:27", 4, "b")).toDF("time", "value", "id") + ("2016-03-27 19:39:34", 1, "a"), + ("2016-03-27 19:39:56", 2, "a"), + ("2016-03-27 19:39:27", 4, "b")).toDF("time", "value", "id") + .select(window($"time", "10 seconds"), $"value") + .orderBy($"window.start".asc) + .select($"window.start".cast("string"), $"window.end".cast("string"), $"value") + + val expands = df.queryExecution.optimizedPlan.find(_.isInstanceOf[Expand]) + assert(expands.isEmpty, "Tumbling windows shouldn't require expand") checkAnswer( - df.select(window($"time", "10 seconds"), $"value") - .orderBy($"window.start".asc) - .select($"window.start".cast("string"), $"window.end".cast("string"), $"value"), + df, Seq( Row("2016-03-27 19:39:20", "2016-03-27 19:39:30", 4), Row("2016-03-27 19:39:30", "2016-03-27 19:39:40", 1), @@ -104,13 +123,17 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B test("sliding window projection") { val df = 
Seq( - ("2016-03-27 19:39:34", 1, "a"), - ("2016-03-27 19:39:56", 2, "a"), - ("2016-03-27 19:39:27", 4, "b")).toDF("time", "value", "id") + ("2016-03-27 19:39:34", 1, "a"), + ("2016-03-27 19:39:56", 2, "a"), + ("2016-03-27 19:39:27", 4, "b")).toDF("time", "value", "id") + .select(window($"time", "10 seconds", "3 seconds", "0 second"), $"value") + .orderBy($"window.start".asc, $"value".desc).select("value") + + val expands = df.queryExecution.optimizedPlan.find(_.isInstanceOf[Expand]) + assert(expands.nonEmpty, "Sliding windows require expand") checkAnswer( - df.select(window($"time", "10 seconds", "3 seconds", "0 second"), $"value") - .orderBy($"window.start".asc, $"value".desc).select("value"), + df, // 2016-03-27 19:39:27 UTC -> 4 bins // 2016-03-27 19:39:34 UTC -> 3 bins // 2016-03-27 19:39:56 UTC -> 3 bins From 9e50a1d37a4cf0c34e20a7c1a910ceaff41535a2 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Mon, 26 Jun 2017 11:14:03 -0500 Subject: [PATCH 022/779] [SPARK-13669][SPARK-20898][CORE] Improve the blacklist mechanism to handle external shuffle service unavailable situation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Currently we are running into an issue with Yarn work preserving enabled + external shuffle service. In the work preserving enabled scenario, the failure of NM will not lead to the exit of executors, so executors can still accept and run the tasks. The problem here is when NM is failed, external shuffle service is actually inaccessible, so reduce tasks will always complain about the “Fetch failure”, and the failure of reduce stage will make the parent stage (map stage) rerun. The tricky thing here is Spark scheduler is not aware of the unavailability of external shuffle service, and will reschedule the map tasks on the executor where NM is failed, and again reduce stage will be failed with “Fetch failure”, and after 4 retries, the job is failed. This could also apply to other cluster manager with external shuffle service. So here the main problem is that we should avoid assigning tasks to those bad executors (where shuffle service is unavailable). Current Spark's blacklist mechanism could blacklist executors/nodes by failure tasks, but it doesn't handle this specific fetch failure scenario. So here propose to improve the current application blacklist mechanism to handle fetch failure issue (especially with external shuffle service unavailable issue), to blacklist the executors/nodes where shuffle fetch is unavailable. ## How was this patch tested? Unit test and small cluster verification. Author: jerryshao Closes #17113 from jerryshao/SPARK-13669. 
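In configuration terms, the behaviour described above is opted into roughly as follows; this is a sketch, and the last setting is only needed if blacklisted executors should also be killed.

```scala
// Sketch: enable fetch-failure blacklisting together with the external shuffle
// service, so a fetch failure blacklists the whole node instead of letting map
// tasks be rescheduled onto it.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.blacklist.enabled", "true")
  .set("spark.shuffle.service.enabled", "true")
  .set("spark.blacklist.application.fetchFailure.enabled", "true")
  // Optional: also kill executors on a blacklisted executor/node.
  .set("spark.blacklist.killBlacklistedExecutors", "true")
```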
--- .../spark/internal/config/package.scala | 5 + .../spark/scheduler/BlacklistTracker.scala | 95 ++++++++++++++----- .../spark/scheduler/TaskSchedulerImpl.scala | 18 +--- .../spark/scheduler/TaskSetManager.scala | 6 ++ .../scheduler/BlacklistTrackerSuite.scala | 55 +++++++++++ .../scheduler/TaskSchedulerImplSuite.scala | 4 +- .../spark/scheduler/TaskSetManagerSuite.scala | 32 +++++++ docs/configuration.md | 9 ++ 8 files changed, 186 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 462c1890fd8df..be63c637a3a13 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -149,6 +149,11 @@ package object config { .internal() .timeConf(TimeUnit.MILLISECONDS) .createOptional + + private[spark] val BLACKLIST_FETCH_FAILURE_ENABLED = + ConfigBuilder("spark.blacklist.application.fetchFailure.enabled") + .booleanConf + .createWithDefault(false) // End blacklist confs private[spark] val UNREGISTER_OUTPUT_ON_HOST_ON_FETCH_FAILURE = diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index e130e609e4f63..cd8e61d6d0208 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -61,6 +61,7 @@ private[scheduler] class BlacklistTracker ( private val MAX_FAILURES_PER_EXEC = conf.get(config.MAX_FAILURES_PER_EXEC) private val MAX_FAILED_EXEC_PER_NODE = conf.get(config.MAX_FAILED_EXEC_PER_NODE) val BLACKLIST_TIMEOUT_MILLIS = BlacklistTracker.getBlacklistTimeout(conf) + private val BLACKLIST_FETCH_FAILURE_ENABLED = conf.get(config.BLACKLIST_FETCH_FAILURE_ENABLED) /** * A map from executorId to information on task failures. Tracks the time of each task failure, @@ -145,6 +146,74 @@ private[scheduler] class BlacklistTracker ( nextExpiryTime = math.min(execMinExpiry, nodeMinExpiry) } + private def killBlacklistedExecutor(exec: String): Unit = { + if (conf.get(config.BLACKLIST_KILL_ENABLED)) { + allocationClient match { + case Some(a) => + logInfo(s"Killing blacklisted executor id $exec " + + s"since ${config.BLACKLIST_KILL_ENABLED.key} is set.") + a.killExecutors(Seq(exec), true, true) + case None => + logWarning(s"Not attempting to kill blacklisted executor id $exec " + + s"since allocation client is not defined.") + } + } + } + + private def killExecutorsOnBlacklistedNode(node: String): Unit = { + if (conf.get(config.BLACKLIST_KILL_ENABLED)) { + allocationClient match { + case Some(a) => + logInfo(s"Killing all executors on blacklisted host $node " + + s"since ${config.BLACKLIST_KILL_ENABLED.key} is set.") + if (a.killExecutorsOnHost(node) == false) { + logError(s"Killing executors on node $node failed.") + } + case None => + logWarning(s"Not attempting to kill executors on blacklisted host $node " + + s"since allocation client is not defined.") + } + } + } + + def updateBlacklistForFetchFailure(host: String, exec: String): Unit = { + if (BLACKLIST_FETCH_FAILURE_ENABLED) { + // If we blacklist on fetch failures, we are implicitly saying that we believe the failure is + // non-transient, and can't be recovered from (even if this is the first fetch failure, + // stage is retried after just one failure, so we don't always get a chance to collect + // multiple fetch failures). 
+ // If the external shuffle-service is on, then every other executor on this node would + // be suffering from the same issue, so we should blacklist (and potentially kill) all + // of them immediately. + + val now = clock.getTimeMillis() + val expiryTimeForNewBlacklists = now + BLACKLIST_TIMEOUT_MILLIS + + if (conf.get(config.SHUFFLE_SERVICE_ENABLED)) { + if (!nodeIdToBlacklistExpiryTime.contains(host)) { + logInfo(s"blacklisting node $host due to fetch failure of external shuffle service") + + nodeIdToBlacklistExpiryTime.put(host, expiryTimeForNewBlacklists) + listenerBus.post(SparkListenerNodeBlacklisted(now, host, 1)) + _nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet) + killExecutorsOnBlacklistedNode(host) + updateNextExpiryTime() + } + } else if (!executorIdToBlacklistStatus.contains(exec)) { + logInfo(s"Blacklisting executor $exec due to fetch failure") + + executorIdToBlacklistStatus.put(exec, BlacklistedExecutor(host, expiryTimeForNewBlacklists)) + // We hardcoded number of failure tasks to 1 for fetch failure, because there's no + // reattempt for such failure. + listenerBus.post(SparkListenerExecutorBlacklisted(now, exec, 1)) + updateNextExpiryTime() + killBlacklistedExecutor(exec) + + val blacklistedExecsOnNode = nodeToBlacklistedExecs.getOrElseUpdate(exec, HashSet[String]()) + blacklistedExecsOnNode += exec + } + } + } def updateBlacklistForSuccessfulTaskSet( stageId: Int, @@ -174,17 +243,7 @@ private[scheduler] class BlacklistTracker ( listenerBus.post(SparkListenerExecutorBlacklisted(now, exec, newTotal)) executorIdToFailureList.remove(exec) updateNextExpiryTime() - if (conf.get(config.BLACKLIST_KILL_ENABLED)) { - allocationClient match { - case Some(allocationClient) => - logInfo(s"Killing blacklisted executor id $exec " + - s"since spark.blacklist.killBlacklistedExecutors is set.") - allocationClient.killExecutors(Seq(exec), true, true) - case None => - logWarning(s"Not attempting to kill blacklisted executor id $exec " + - s"since allocation client is not defined.") - } - } + killBlacklistedExecutor(exec) // In addition to blacklisting the executor, we also update the data for failures on the // node, and potentially put the entire node into a blacklist as well. 
@@ -199,19 +258,7 @@ private[scheduler] class BlacklistTracker ( nodeIdToBlacklistExpiryTime.put(node, expiryTimeForNewBlacklists) listenerBus.post(SparkListenerNodeBlacklisted(now, node, blacklistedExecsOnNode.size)) _nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet) - if (conf.get(config.BLACKLIST_KILL_ENABLED)) { - allocationClient match { - case Some(allocationClient) => - logInfo(s"Killing all executors on blacklisted host $node " + - s"since spark.blacklist.killBlacklistedExecutors is set.") - if (allocationClient.killExecutorsOnHost(node) == false) { - logError(s"Killing executors on node $node failed.") - } - case None => - logWarning(s"Not attempting to kill executors on blacklisted host $node " + - s"since allocation client is not defined.") - } - } + killExecutorsOnBlacklistedNode(node) } } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index bba0b294f1afb..91ec172ffeda1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -51,29 +51,21 @@ import org.apache.spark.util.{AccumulatorV2, ThreadUtils, Utils} * acquire a lock on us, so we need to make sure that we don't try to lock the backend while * we are holding a lock on ourselves. */ -private[spark] class TaskSchedulerImpl private[scheduler]( +private[spark] class TaskSchedulerImpl( val sc: SparkContext, val maxTaskFailures: Int, - private[scheduler] val blacklistTrackerOpt: Option[BlacklistTracker], isLocal: Boolean = false) extends TaskScheduler with Logging { import TaskSchedulerImpl._ def this(sc: SparkContext) = { - this( - sc, - sc.conf.get(config.MAX_TASK_FAILURES), - TaskSchedulerImpl.maybeCreateBlacklistTracker(sc)) + this(sc, sc.conf.get(config.MAX_TASK_FAILURES)) } - def this(sc: SparkContext, maxTaskFailures: Int, isLocal: Boolean) = { - this( - sc, - maxTaskFailures, - TaskSchedulerImpl.maybeCreateBlacklistTracker(sc), - isLocal = isLocal) - } + // Lazily initializing blackListTrackOpt to avoid getting empty ExecutorAllocationClient, + // because ExecutorAllocationClient is created after this TaskSchedulerImpl. 
+ private[scheduler] lazy val blacklistTrackerOpt = maybeCreateBlacklistTracker(sc) val conf = sc.conf diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index a41b059fa7dec..02d374dc37cd5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -774,6 +774,12 @@ private[spark] class TaskSetManager( tasksSuccessful += 1 } isZombie = true + + if (fetchFailed.bmAddress != null) { + blacklistTracker.foreach(_.updateBlacklistForFetchFailure( + fetchFailed.bmAddress.host, fetchFailed.bmAddress.executorId)) + } + None case ef: ExceptionFailure => diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala index 571c6bbb4585d..7ff03c44b0611 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala @@ -530,4 +530,59 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M verify(allocationClientMock).killExecutors(Seq("2"), true, true) verify(allocationClientMock).killExecutorsOnHost("hostA") } + + test("fetch failure blacklisting kills executors, configured by BLACKLIST_KILL_ENABLED") { + val allocationClientMock = mock[ExecutorAllocationClient] + when(allocationClientMock.killExecutors(any(), any(), any())).thenReturn(Seq("called")) + when(allocationClientMock.killExecutorsOnHost("hostA")).thenAnswer(new Answer[Boolean] { + // To avoid a race between blacklisting and killing, it is important that the nodeBlacklist + // is updated before we ask the executor allocation client to kill all the executors + // on a particular host. + override def answer(invocation: InvocationOnMock): Boolean = { + if (blacklist.nodeBlacklist.contains("hostA") == false) { + throw new IllegalStateException("hostA should be on the blacklist") + } + true + } + }) + + conf.set(config.BLACKLIST_FETCH_FAILURE_ENABLED, true) + blacklist = new BlacklistTracker(listenerBusMock, conf, Some(allocationClientMock), clock) + + // Disable auto-kill. Blacklist an executor and make sure killExecutors is not called. + conf.set(config.BLACKLIST_KILL_ENABLED, false) + blacklist.updateBlacklistForFetchFailure("hostA", exec = "1") + + verify(allocationClientMock, never).killExecutors(any(), any(), any()) + verify(allocationClientMock, never).killExecutorsOnHost(any()) + + // Enable auto-kill. Blacklist an executor and make sure killExecutors is called. + conf.set(config.BLACKLIST_KILL_ENABLED, true) + blacklist = new BlacklistTracker(listenerBusMock, conf, Some(allocationClientMock), clock) + clock.advance(1000) + blacklist.updateBlacklistForFetchFailure("hostA", exec = "1") + + verify(allocationClientMock).killExecutors(Seq("1"), true, true) + verify(allocationClientMock, never).killExecutorsOnHost(any()) + + assert(blacklist.executorIdToBlacklistStatus.contains("1")) + assert(blacklist.executorIdToBlacklistStatus("1").node === "hostA") + assert(blacklist.executorIdToBlacklistStatus("1").expiryTime === + 1000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) + assert(blacklist.nextExpiryTime === 1000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) + assert(blacklist.nodeIdToBlacklistExpiryTime.isEmpty) + + // Enable external shuffle service to see if all the executors on this node will be killed. 
+ conf.set(config.SHUFFLE_SERVICE_ENABLED, true) + clock.advance(1000) + blacklist.updateBlacklistForFetchFailure("hostA", exec = "2") + + verify(allocationClientMock, never).killExecutors(Seq("2"), true, true) + verify(allocationClientMock).killExecutorsOnHost("hostA") + + assert(blacklist.nodeIdToBlacklistExpiryTime.contains("hostA")) + assert(blacklist.nodeIdToBlacklistExpiryTime("hostA") === + 2000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) + assert(blacklist.nextExpiryTime === 1000 + blacklist.BLACKLIST_TIMEOUT_MILLIS) + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 8b9d45f734cda..a00337776dadc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -87,7 +87,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B conf.set(config.BLACKLIST_ENABLED, true) sc = new SparkContext(conf) taskScheduler = - new TaskSchedulerImpl(sc, sc.conf.getInt("spark.task.maxFailures", 4), Some(blacklist)) { + new TaskSchedulerImpl(sc, sc.conf.getInt("spark.task.maxFailures", 4)) { override def createTaskSetManager(taskSet: TaskSet, maxFailures: Int): TaskSetManager = { val tsm = super.createTaskSetManager(taskSet, maxFailures) // we need to create a spied tsm just so we can set the TaskSetBlacklist @@ -98,6 +98,8 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B stageToMockTaskSetBlacklist(taskSet.stageId) = taskSetBlacklist tsmSpy } + + override private[scheduler] lazy val blacklistTrackerOpt = Some(blacklist) } setupHelper() } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index db14c9acfdce5..80fb674725814 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -1140,6 +1140,38 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg .updateBlacklistForFailedTask(anyString(), anyString(), anyInt()) } + test("update application blacklist for shuffle-fetch") { + // Setup a taskset, and fail some one task for fetch failure. 
+ val conf = new SparkConf() + .set(config.BLACKLIST_ENABLED, true) + .set(config.SHUFFLE_SERVICE_ENABLED, true) + .set(config.BLACKLIST_FETCH_FAILURE_ENABLED, true) + sc = new SparkContext("local", "test", conf) + sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2")) + val taskSet = FakeTask.createTaskSet(4) + val blacklistTracker = new BlacklistTracker(sc, None) + val tsm = new TaskSetManager(sched, taskSet, 4, Some(blacklistTracker)) + + // make some offers to our taskset, to get tasks we will fail + val taskDescs = Seq( + "exec1" -> "host1", + "exec2" -> "host2" + ).flatMap { case (exec, host) => + // offer each executor twice (simulating 2 cores per executor) + (0 until 2).flatMap{ _ => tsm.resourceOffer(exec, host, TaskLocality.ANY)} + } + assert(taskDescs.size === 4) + + assert(!blacklistTracker.isExecutorBlacklisted(taskDescs(0).executorId)) + assert(!blacklistTracker.isNodeBlacklisted("host1")) + + // Fail the task with fetch failure + tsm.handleFailedTask(taskDescs(0).taskId, TaskState.FAILED, + FetchFailed(BlockManagerId(taskDescs(0).executorId, "host1", 12345), 0, 0, 0, "ignored")) + + assert(blacklistTracker.isNodeBlacklisted("host1")) + } + private def createTaskResult( id: Int, accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { diff --git a/docs/configuration.md b/docs/configuration.md index f4bec589208be..c8e61537a457c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1479,6 +1479,15 @@ Apart from these, the following properties are also available, and may be useful all of the executors on that node will be killed. + + spark.blacklist.application.fetchFailure.enabled + false + + (Experimental) If set to "true", Spark will blacklist the executor immediately when a fetch + failure happenes. If external shuffle service is enabled, then the whole node will be + blacklisted. + + spark.speculation false From c22810004fb2db249be6477c9801d09b807af851 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 27 Jun 2017 02:35:51 +0800 Subject: [PATCH 023/779] [SPARK-20213][SQL][FOLLOW-UP] introduce SQLExecution.ignoreNestedExecutionId ## What changes were proposed in this pull request? in https://github.com/apache/spark/pull/18064, to work around the nested sql execution id issue, we introduced several internal methods in `Dataset`, like `collectInternal`, `countInternal`, `showInternal`, etc., to avoid nested execution id. However, this approach has poor expansibility. When we hit other nested execution id cases, we may need to add more internal methods in `Dataset`. Our goal is to ignore the nested execution id in some cases, and we can have a better approach to achieve this goal, by introducing `SQLExecution.ignoreNestedExecutionId`. Whenever we find a place which needs to ignore the nested execution, we can just wrap the action with `SQLExecution.ignoreNestedExecutionId`, and this is more expansible than the previous approach. The idea comes from https://github.com/apache/spark/pull/17540/files#diff-ab49028253e599e6e74cc4f4dcb2e3a8R57 by rdblue ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #18419 from cloud-fan/follow. 
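A rough usage sketch of the new helper follows. The exact signature is truncated in this excerpt and is assumed below; `sparkSession` and `df` stand in for whatever the internal caller (e.g. a sink implementation) already has in scope, and the helper is internal API rather than something user code is expected to call.

```scala
// Assumed-signature sketch of the internal helper: Spark jobs issued while
// evaluating the body are not tracked under a (nested) SQL execution id.
import org.apache.spark.sql.execution.SQLExecution

val rows = SQLExecution.ignoreNestedExecutionId(sparkSession) {
  // If this runs inside another execution, it no longer fails with
  // "spark.sql.execution.id is already set".
  df.collect()
}
```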
--- .../scala/org/apache/spark/sql/Dataset.scala | 39 ++----------------- .../spark/sql/execution/SQLExecution.scala | 39 +++++++++++++++++-- .../command/AnalyzeTableCommand.scala | 5 ++- .../spark/sql/execution/command/cache.scala | 19 ++++----- .../datasources/csv/CSVDataSource.scala | 6 ++- .../datasources/jdbc/JDBCRelation.scala | 14 +++---- .../sql/execution/streaming/console.scala | 13 +++++-- .../sql/execution/streaming/memory.scala | 33 +++++++++------- 8 files changed, 89 insertions(+), 79 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 6e66e92091ff9..268a37ff5d271 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -246,13 +246,8 @@ class Dataset[T] private[sql]( _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { val numRows = _numRows.max(0) val takeResult = toDF().take(numRows + 1) - showString(takeResult, numRows, truncate, vertical) - } - - private def showString( - dataWithOneMoreRow: Array[Row], numRows: Int, truncate: Int, vertical: Boolean): String = { - val hasMoreData = dataWithOneMoreRow.length > numRows - val data = dataWithOneMoreRow.take(numRows) + val hasMoreData = takeResult.length > numRows + val data = takeResult.take(numRows) lazy val timeZone = DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone) @@ -688,19 +683,6 @@ class Dataset[T] private[sql]( println(showString(numRows, truncate = 0)) } - // An internal version of `show`, which won't set execution id and trigger listeners. - private[sql] def showInternal(_numRows: Int, truncate: Boolean): Unit = { - val numRows = _numRows.max(0) - val takeResult = toDF().takeInternal(numRows + 1) - - if (truncate) { - println(showString(takeResult, numRows, truncate = 20, vertical = false)) - } else { - println(showString(takeResult, numRows, truncate = 0, vertical = false)) - } - } - // scalastyle:on println - /** * Displays the Dataset in a tabular form. For example: * {{{ @@ -2467,11 +2449,6 @@ class Dataset[T] private[sql]( */ def take(n: Int): Array[T] = head(n) - // An internal version of `take`, which won't set execution id and trigger listeners. - private[sql] def takeInternal(n: Int): Array[T] = { - collectFromPlan(limit(n).queryExecution.executedPlan) - } - /** * Returns the first `n` rows in the Dataset as a list. * @@ -2496,11 +2473,6 @@ class Dataset[T] private[sql]( */ def collect(): Array[T] = withAction("collect", queryExecution)(collectFromPlan) - // An internal version of `collect`, which won't set execution id and trigger listeners. - private[sql] def collectInternal(): Array[T] = { - collectFromPlan(queryExecution.executedPlan) - } - /** * Returns a Java list that contains all rows in this Dataset. * @@ -2542,11 +2514,6 @@ class Dataset[T] private[sql]( plan.executeCollect().head.getLong(0) } - // An internal version of `count`, which won't set execution id and trigger listeners. - private[sql] def countInternal(): Long = { - groupBy().count().queryExecution.executedPlan.executeCollect().head.getLong(0) - } - /** * Returns a new Dataset that has exactly `numPartitions` partitions. 
* @@ -2792,7 +2759,7 @@ class Dataset[T] private[sql]( createTempViewCommand(viewName, replace = true, global = true) } - private[spark] def createTempViewCommand( + private def createTempViewCommand( viewName: String, replace: Boolean, global: Boolean): CreateViewCommand = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index bb206e84325fd..ca8bed5214f87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -29,6 +29,8 @@ object SQLExecution { val EXECUTION_ID_KEY = "spark.sql.execution.id" + private val IGNORE_NESTED_EXECUTION_ID = "spark.sql.execution.ignoreNestedExecutionId" + private val _nextExecutionId = new AtomicLong(0) private def nextExecutionId: Long = _nextExecutionId.getAndIncrement @@ -42,8 +44,11 @@ object SQLExecution { private val testing = sys.props.contains("spark.testing") private[sql] def checkSQLExecutionId(sparkSession: SparkSession): Unit = { + val sc = sparkSession.sparkContext + val isNestedExecution = sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) != null + val hasExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) != null // only throw an exception during tests. a missing execution ID should not fail a job. - if (testing && sparkSession.sparkContext.getLocalProperty(EXECUTION_ID_KEY) == null) { + if (testing && !isNestedExecution && !hasExecutionId) { // Attention testers: when a test fails with this exception, it means that the action that // started execution of a query didn't call withNewExecutionId. The execution ID should be // set by calling withNewExecutionId in the action that begins execution, like @@ -65,7 +70,7 @@ object SQLExecution { val executionId = SQLExecution.nextExecutionId sc.setLocalProperty(EXECUTION_ID_KEY, executionId.toString) executionIdToQueryExecution.put(executionId, queryExecution) - val r = try { + try { // sparkContext.getCallSite() would first try to pick up any call site that was previously // set, then fall back to Utils.getCallSite(); call Utils.getCallSite() directly on // streaming queries would give us call site like "run at :0" @@ -84,7 +89,15 @@ object SQLExecution { executionIdToQueryExecution.remove(executionId) sc.setLocalProperty(EXECUTION_ID_KEY, null) } - r + } else if (sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) != null) { + // If `IGNORE_NESTED_EXECUTION_ID` is set, just ignore the execution id while evaluating the + // `body`, so that Spark jobs issued in the `body` won't be tracked. + try { + sc.setLocalProperty(EXECUTION_ID_KEY, null) + body + } finally { + sc.setLocalProperty(EXECUTION_ID_KEY, oldExecutionId) + } } else { // Don't support nested `withNewExecutionId`. This is an example of the nested // `withNewExecutionId`: @@ -100,7 +113,9 @@ object SQLExecution { // all accumulator metrics will be 0. It will confuse people if we show them in Web UI. // // A real case is the `DataFrame.count` method. - throw new IllegalArgumentException(s"$EXECUTION_ID_KEY is already set") + throw new IllegalArgumentException(s"$EXECUTION_ID_KEY is already set, please wrap your " + + "action with SQLExecution.ignoreNestedExecutionId if you don't want to track the Spark " + + "jobs issued by the nested execution.") } } @@ -118,4 +133,20 @@ object SQLExecution { sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, oldExecutionId) } } + + /** + * Wrap an action which may have nested execution id. 
This method can be used to run an execution + * inside another execution, e.g., `CacheTableCommand` need to call `Dataset.collect`. Note that, + * all Spark jobs issued in the body won't be tracked in UI. + */ + def ignoreNestedExecutionId[T](sparkSession: SparkSession)(body: => T): T = { + val sc = sparkSession.sparkContext + val allowNestedPreviousValue = sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) + try { + sc.setLocalProperty(IGNORE_NESTED_EXECUTION_ID, "true") + body + } finally { + sc.setLocalProperty(IGNORE_NESTED_EXECUTION_ID, allowNestedPreviousValue) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index 06e588f56f1e9..13b8faff844c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -27,6 +27,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, CatalogTableType} +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.internal.SessionState @@ -58,7 +59,9 @@ case class AnalyzeTableCommand( // 2. when total size is changed, `oldRowCount` becomes invalid. // This is to make sure that we only record the right statistics. if (!noscan) { - val newRowCount = sparkSession.table(tableIdentWithDB).countInternal() + val newRowCount = SQLExecution.ignoreNestedExecutionId(sparkSession) { + sparkSession.table(tableIdentWithDB).count() + } if (newRowCount >= 0 && newRowCount != oldRowCount) { newStats = if (newStats.isDefined) { newStats.map(_.copy(rowCount = Some(BigInt(newRowCount)))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index 184d0387ebfa9..d36eb7587a3ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SQLExecution case class CacheTableCommand( tableIdent: TableIdentifier, @@ -33,16 +34,16 @@ case class CacheTableCommand( override def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { - plan.foreach { logicalPlan => - Dataset.ofRows(sparkSession, logicalPlan) - .createTempViewCommand(tableIdent.quotedString, replace = false, global = false) - .run(sparkSession) - } - sparkSession.catalog.cacheTable(tableIdent.quotedString) + SQLExecution.ignoreNestedExecutionId(sparkSession) { + plan.foreach { logicalPlan => + Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) + } + sparkSession.catalog.cacheTable(tableIdent.quotedString) - if (!isLazy) { - // Performs eager caching - sparkSession.table(tableIdent).countInternal() + if (!isLazy) { + // Performs eager caching + sparkSession.table(tableIdent).count() + } } Seq.empty[Row] diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index eadc6c94f4b3c..99133bd70989a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -32,6 +32,7 @@ import org.apache.spark.input.{PortableDataStream, StreamInputFormat} import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.text.TextFileFormat import org.apache.spark.sql.types.StructType @@ -144,8 +145,9 @@ object TextInputCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseDataset(sparkSession, inputPaths, parsedOptions) - val maybeFirstLine = - CSVUtils.filterCommentAndEmpty(csv, parsedOptions).takeInternal(1).headOption + val maybeFirstLine = SQLExecution.ignoreNestedExecutionId(sparkSession) { + CSVUtils.filterCommentAndEmpty(csv, parsedOptions).take(1).headOption + } inferFromDataset(sparkSession, csv, maybeFirstLine, parsedOptions) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index a06f1ce3287e6..b11da7045de22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -23,6 +23,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.jdbc.JdbcDialects import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType @@ -129,14 +130,11 @@ private[sql] case class JDBCRelation( } override def insert(data: DataFrame, overwrite: Boolean): Unit = { - import scala.collection.JavaConverters._ - - val options = jdbcOptions.asProperties.asScala + - ("url" -> jdbcOptions.url, "dbtable" -> jdbcOptions.table) - val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append - - new JdbcRelationProvider().createRelation( - data.sparkSession.sqlContext, mode, options.toMap, data) + SQLExecution.ignoreNestedExecutionId(data.sparkSession) { + data.write + .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) + .jdbc(jdbcOptions.url, jdbcOptions.table, jdbcOptions.asProperties) + } } override def toString: String = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 9e889ff679450..6fa7c113defaa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, StreamSinkProvider} import 
org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.types.StructType class ConsoleSink(options: Map[String, String]) extends Sink with Logging { @@ -47,9 +48,11 @@ class ConsoleSink(options: Map[String, String]) extends Sink with Logging { println(batchIdStr) println("-------------------------------------------") // scalastyle:off println - data.sparkSession.createDataFrame( - data.sparkSession.sparkContext.parallelize(data.collectInternal()), data.schema) - .showInternal(numRowsToShow, isTruncated) + SQLExecution.ignoreNestedExecutionId(data.sparkSession) { + data.sparkSession.createDataFrame( + data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) + .show(numRowsToShow, isTruncated) + } } } @@ -79,7 +82,9 @@ class ConsoleSinkProvider extends StreamSinkProvider // Truncate the displayed data if it is too long, by default it is true val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true) - data.showInternal(numRowsToShow, isTruncated) + SQLExecution.ignoreNestedExecutionId(sqlContext.sparkSession) { + data.show(numRowsToShow, isTruncated) + } ConsoleRelation(sqlContext, data) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index a5dac469f85b6..198a342582804 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -193,21 +194,23 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi } if (notCommitted) { logDebug(s"Committing batch $batchId to $this") - outputMode match { - case Append | Update => - val rows = AddedData(batchId, data.collectInternal()) - synchronized { batches += rows } - - case Complete => - val rows = AddedData(batchId, data.collectInternal()) - synchronized { - batches.clear() - batches += rows - } - - case _ => - throw new IllegalArgumentException( - s"Output mode $outputMode is not supported by MemorySink") + SQLExecution.ignoreNestedExecutionId(data.sparkSession) { + outputMode match { + case Append | Update => + val rows = AddedData(batchId, data.collect()) + synchronized { batches += rows } + + case Complete => + val rows = AddedData(batchId, data.collect()) + synchronized { + batches.clear() + batches += rows + } + + case _ => + throw new IllegalArgumentException( + s"Output mode $outputMode is not supported by MemorySink") + } } } else { logDebug(s"Skipping already committed batch: $batchId") From 3cb3ccce120fa9f0273133912624b877b42d95fd Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Tue, 27 Jun 2017 17:24:46 +0800 Subject: [PATCH 024/779] [SPARK-21196] Split codegen info of query plan into sequence codegen info of query plan can be very long. In debugging console / web page, it would be more readable if the subtrees and corresponding codegen are split into sequence. 
Example: ```java codegenStringSeq(sql("select 1").queryExecution.executedPlan) ``` The example will return Seq[(String, String)] of length 1, containing the subtree as string and the corresponding generated code. The subtree as string: > (*Project [1 AS 1#0] > +- Scan OneRowRelation[] The generated code: ```java /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIterator(references); /* 003 */ } /* 004 */ /* 005 */ final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator { /* 006 */ private Object[] references; /* 007 */ private scala.collection.Iterator[] inputs; /* 008 */ private scala.collection.Iterator inputadapter_input; /* 009 */ private UnsafeRow project_result; /* 010 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder project_holder; /* 011 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter project_rowWriter; /* 012 */ /* 013 */ public GeneratedIterator(Object[] references) { /* 014 */ this.references = references; /* 015 */ } /* 016 */ /* 017 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 018 */ partitionIndex = index; /* 019 */ this.inputs = inputs; /* 020 */ inputadapter_input = inputs[0]; /* 021 */ project_result = new UnsafeRow(1); /* 022 */ project_holder = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(project_result, 0); /* 023 */ project_rowWriter = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(project_holder, 1); /* 024 */ /* 025 */ } /* 026 */ /* 027 */ protected void processNext() throws java.io.IOException { /* 028 */ while (inputadapter_input.hasNext() && !stopEarly()) { /* 029 */ InternalRow inputadapter_row = (InternalRow) inputadapter_input.next(); /* 030 */ project_rowWriter.write(0, 1); /* 031 */ append(project_result); /* 032 */ if (shouldStop()) return; /* 033 */ } /* 034 */ } /* 035 */ /* 036 */ } ``` ## What changes were proposed in this pull request? add method codegenToSeq: split codegen info of query plan into sequence ## How was this patch tested? unit test cloud-fan gatorsmile Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang Closes #18409 from gengliangwang/codegen. 
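
A short usage sketch of the new Seq-based API (illustrative only; `codegenStringSeq` is the helper added by this patch in `org.apache.spark.sql.execution.debug`, and the example plan matches the one used in the new unit test):

```scala
// Sketch: consume the per-subtree (subtree, generatedCode) pairs one at a time
// instead of one concatenated string. Assumes a local SparkSession for the demo.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.debug.codegenStringSeq

val spark = SparkSession.builder().master("local[*]").appName("codegen-seq-demo").getOrCreate()
val plan = spark.range(10).groupBy("id").count().queryExecution.executedPlan

codegenStringSeq(plan).zipWithIndex.foreach { case ((subtree, code), i) =>
  println(s"== Subtree ${i + 1} ==")
  println(subtree)                                  // the WholeStageCodegen subtree
  println(s"generated code: ${code.length} chars")  // formatted generated Java source
}
```

The diff that follows also exposes the same sequence through the new `codegenToSeq()` added to `QueryExecution`'s debug namespace.
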
--- .../spark/sql/execution/QueryExecution.scala | 9 +++++ .../spark/sql/execution/debug/package.scala | 35 ++++++++++++++----- .../sql/execution/debug/DebuggingSuite.scala | 7 ++++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index c7cac332a0377..9533144214a10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -245,5 +245,14 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { println(org.apache.spark.sql.execution.debug.codegenString(executedPlan)) // scalastyle:on println } + + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan + * + * @return Sequence of WholeStageCodegen subtrees and corresponding codegen + */ + def codegenToSeq(): Seq[(String, String)] = { + org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 0395c43ba2cbc..a717cbd4a7df9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -50,7 +50,31 @@ package object debug { // scalastyle:on println } + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan into one String + * + * @param plan the query plan for codegen + * @return single String containing all WholeStageCodegen subtrees and corresponding codegen + */ def codegenString(plan: SparkPlan): String = { + val codegenSeq = codegenStringSeq(plan) + var output = s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n" + for (((subtree, code), i) <- codegenSeq.zipWithIndex) { + output += s"== Subtree ${i + 1} / ${codegenSeq.size} ==\n" + output += subtree + output += "\nGenerated code:\n" + output += s"${code}\n" + } + output + } + + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan + * + * @param plan the query plan for codegen + * @return Sequence of WholeStageCodegen subtrees and corresponding codegen + */ + def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan transform { case s: WholeStageCodegenExec => @@ -58,15 +82,10 @@ package object debug { s case s => s } - var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n" - for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) { - output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n" - output += s - output += "\nGenerated code:\n" - val (_, source) = s.doCodeGen() - output += s"${CodeFormatter.format(source)}\n" + codegenSubtrees.toSeq.map { subtree => + val (_, source) = subtree.doCodeGen() + (subtree.toString, CodeFormatter.format(source)) } - output } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 4fc52c99fbeeb..adcaf2d76519f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -38,4 +38,11 @@ class DebuggingSuite extends SparkFunSuite 
with SharedSQLContext { assert(res.contains("Subtree 2 / 2")) assert(res.contains("Object[]")) } + + test("debugCodegenStringSeq") { + val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) + assert(res.length == 2) + assert(res.forall{ case (subtree, code) => + subtree.contains("Range") && code.contains("Object[]")}) + } } From b32bd005e46443bbd487b7a1f1078578c8f4c181 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 27 Jun 2017 13:14:12 +0100 Subject: [PATCH 025/779] [INFRA] Close stale PRs ## What changes were proposed in this pull request? This PR proposes to close stale PRs, mostly the same instances with https://github.com/apache/spark/pull/18017 I believe the author in #14807 removed his account. Closes #7075 Closes #8927 Closes #9202 Closes #9366 Closes #10861 Closes #11420 Closes #12356 Closes #13028 Closes #13506 Closes #14191 Closes #14198 Closes #14330 Closes #14807 Closes #15839 Closes #16225 Closes #16685 Closes #16692 Closes #16995 Closes #17181 Closes #17211 Closes #17235 Closes #17237 Closes #17248 Closes #17341 Closes #17708 Closes #17716 Closes #17721 Closes #17937 Added: Closes #14739 Closes #17139 Closes #17445 Closes #18042 Closes #18359 Added: Closes #16450 Closes #16525 Closes #17738 Added: Closes #16458 Closes #16508 Closes #17714 Added: Closes #17830 Closes #14742 ## How was this patch tested? N/A Author: hyukjinkwon Closes #18417 from HyukjinKwon/close-stale-pr. From fd8c931a30a084ee981b75aa469fc97dda6cfaa9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 28 Jun 2017 00:57:05 +0800 Subject: [PATCH 026/779] [SPARK-19104][SQL] Lambda variables in ExternalMapToCatalyst should be global ## What changes were proposed in this pull request? The issue happens in `ExternalMapToCatalyst`. For example, the following codes create `ExternalMapToCatalyst` to convert Scala Map to catalyst map format. val data = Seq.tabulate(10)(i => NestedData(1, Map("key" -> InnerData("name", i + 100)))) val ds = spark.createDataset(data) The `valueConverter` in `ExternalMapToCatalyst` looks like: if (isnull(lambdavariable(ExternalMapToCatalyst_value52, ExternalMapToCatalyst_value_isNull52, ObjectType(class org.apache.spark.sql.InnerData), true))) null else named_struct(name, staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, assertnotnull(lambdavariable(ExternalMapToCatalyst_value52, ExternalMapToCatalyst_value_isNull52, ObjectType(class org.apache.spark.sql.InnerData), true)).name, true), value, assertnotnull(lambdavariable(ExternalMapToCatalyst_value52, ExternalMapToCatalyst_value_isNull52, ObjectType(class org.apache.spark.sql.InnerData), true)).value) There is a `CreateNamedStruct` expression (`named_struct`) to create a row of `InnerData.name` and `InnerData.value` that are referred by `ExternalMapToCatalyst_value52`. Because `ExternalMapToCatalyst_value52` are local variable, when `CreateNamedStruct` splits expressions to individual functions, the local variable can't be accessed anymore. ## How was this patch tested? Jenkins tests. Author: Liang-Chi Hsieh Closes #18418 from viirya/SPARK-19104. 
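
A standalone sketch of the failing scenario, roughly as one would paste it into spark-shell (the case classes mirror the `InnerData`/`NestedData` definitions added to `DatasetPrimitiveSuite` in the diff below; `spark` is assumed to be the shell's active SparkSession):

```scala
// Sketch of the map-of-struct round trip that exercises ExternalMapToCatalyst.
import spark.implicits._

case class InnerData(name: String, value: Int)
case class NestedData(id: Int, param: Map[String, InnerData])

val data = Seq.tabulate(10)(i => NestedData(1, Map("key" -> InnerData("name", i + 100))))

// Serializing the Scala Map generates ExternalMapToCatalyst code; before this fix,
// CreateNamedStruct could split its expressions into separate functions that could
// no longer see the (previously local) key/value lambda variables.
val ds = spark.createDataset(data)
assert(ds.collect().toSeq == data)
```
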
--- .../catalyst/expressions/objects/objects.scala | 18 ++++++++++++------ .../spark/sql/DatasetPrimitiveSuite.scala | 8 ++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 073993cccdf8a..4b651836ff4d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -911,6 +911,12 @@ case class ExternalMapToCatalyst private( val entry = ctx.freshName("entry") val entries = ctx.freshName("entries") + val keyElementJavaType = ctx.javaType(keyType) + val valueElementJavaType = ctx.javaType(valueType) + ctx.addMutableState(keyElementJavaType, key, "") + ctx.addMutableState("boolean", valueIsNull, "") + ctx.addMutableState(valueElementJavaType, value, "") + val (defineEntries, defineKeyValue) = child.dataType match { case ObjectType(cls) if classOf[java.util.Map[_, _]].isAssignableFrom(cls) => val javaIteratorCls = classOf[java.util.Iterator[_]].getName @@ -922,8 +928,8 @@ case class ExternalMapToCatalyst private( val defineKeyValue = s""" final $javaMapEntryCls $entry = ($javaMapEntryCls) $entries.next(); - ${ctx.javaType(keyType)} $key = (${ctx.boxedType(keyType)}) $entry.getKey(); - ${ctx.javaType(valueType)} $value = (${ctx.boxedType(valueType)}) $entry.getValue(); + $key = (${ctx.boxedType(keyType)}) $entry.getKey(); + $value = (${ctx.boxedType(valueType)}) $entry.getValue(); """ defineEntries -> defineKeyValue @@ -937,17 +943,17 @@ case class ExternalMapToCatalyst private( val defineKeyValue = s""" final $scalaMapEntryCls $entry = ($scalaMapEntryCls) $entries.next(); - ${ctx.javaType(keyType)} $key = (${ctx.boxedType(keyType)}) $entry._1(); - ${ctx.javaType(valueType)} $value = (${ctx.boxedType(valueType)}) $entry._2(); + $key = (${ctx.boxedType(keyType)}) $entry._1(); + $value = (${ctx.boxedType(valueType)}) $entry._2(); """ defineEntries -> defineKeyValue } val valueNullCheck = if (ctx.isPrimitiveType(valueType)) { - s"boolean $valueIsNull = false;" + s"$valueIsNull = false;" } else { - s"boolean $valueIsNull = $value == null;" + s"$valueIsNull = $value == null;" } val arrayCls = classOf[GenericArrayData].getName diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala index 4126660b5d102..a6847dcfbffc4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala @@ -39,6 +39,9 @@ case class ComplexClass(seq: SeqClass, list: ListClass, queue: QueueClass) case class ComplexMapClass(map: MapClass, lhmap: LHMapClass) +case class InnerData(name: String, value: Int) +case class NestedData(id: Int, param: Map[String, InnerData]) + package object packageobject { case class PackageClass(value: Int) } @@ -354,4 +357,9 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext { checkDataset(Seq(PackageClass(1)).toDS(), PackageClass(1)) } + test("SPARK-19104: Lambda variables in ExternalMapToCatalyst should be global") { + val data = Seq.tabulate(10)(i => NestedData(1, Map("key" -> InnerData("name", i + 100)))) + val ds = spark.createDataset(data) + checkDataset(ds, data: _*) + } } From 2d686a19e341a31d976aa42228b7589f87dfd6c2 Mon Sep 
17 00:00:00 2001 From: Eric Vandenberg Date: Wed, 28 Jun 2017 09:26:33 +0800 Subject: [PATCH 027/779] [SPARK-21155][WEBUI] Add (? running tasks) into Spark UI progress ## What changes were proposed in this pull request? Add metric on number of running tasks to status bar on Jobs / Active Jobs. ## How was this patch tested? Run a long running (1 minute) query in spark-shell and use localhost:4040 web UI to observe progress. See jira for screen snapshot. Author: Eric Vandenberg Closes #18369 from ericvandenbergfb/runningTasks. --- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 2610f673d27f6..ba798df13c95d 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -356,6 +356,7 @@ private[spark] object UIUtils extends Logging {
{completed}/{total} + { if (failed == 0 && skipped == 0 && started > 0) s"($started running)" } { if (failed > 0) s"($failed failed)" } { if (skipped > 0) s"($skipped skipped)" } { reasonToNumKilled.toSeq.sortBy(-_._2).map { From e793bf248bc3c71b9664f26377bce06b0ffa97a7 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 27 Jun 2017 23:15:45 -0700 Subject: [PATCH 028/779] [SPARK-20889][SPARKR] Grouped documentation for MATH column methods ## What changes were proposed in this pull request? Grouped documentation for math column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18371 from actuaryzhang/sparkRDocMath. --- R/pkg/R/functions.R | 619 +++++++++++++++----------------------------- R/pkg/R/generics.R | 48 ++-- 2 files changed, 241 insertions(+), 426 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 31028585aaa13..23ccdf941a8c7 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -86,6 +86,31 @@ NULL #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))} NULL +#' Math functions for Column operations +#' +#' Math functions defined for \code{Column}. +#' +#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and \code{shiftRightUnsigned}, +#' this is the number of bits to shift. +#' @param y Column to compute on. +#' @param ... additional argument(s). +#' @name column_math_functions +#' @rdname column_math_functions +#' @family math functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp), +#' v3 = bround(df$wt, 1), v4 = bin(df$cyl), +#' v5 = hex(df$wt), v6 = toDegrees(df$gear), +#' v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am), +#' v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1), +#' v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5), +#' v13 = sqrt(df$disp), v14 = ceil(df$wt)) +#' head(tmp)} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. @@ -112,18 +137,12 @@ setMethod("lit", signature("ANY"), column(jc) }) -#' abs -#' -#' Computes the absolute value. -#' -#' @param x Column to compute on. +#' @details +#' \code{abs}: Computes the absolute value. #' -#' @rdname abs -#' @name abs -#' @family non-aggregate functions +#' @rdname column_math_functions #' @export -#' @examples \dontrun{abs(df$c)} -#' @aliases abs,Column-method +#' @aliases abs abs,Column-method #' @note abs since 1.5.0 setMethod("abs", signature(x = "Column"), @@ -132,19 +151,13 @@ setMethod("abs", column(jc) }) -#' acos -#' -#' Computes the cosine inverse of the given value; the returned angle is in the range -#' 0.0 through pi. -#' -#' @param x Column to compute on. +#' @details +#' \code{acos}: Computes the cosine inverse of the given value; the returned angle is in +#' the range 0.0 through pi. #' -#' @rdname acos -#' @name acos -#' @family math functions +#' @rdname column_math_functions #' @export -#' @examples \dontrun{acos(df$c)} -#' @aliases acos,Column-method +#' @aliases acos acos,Column-method #' @note acos since 1.5.0 setMethod("acos", signature(x = "Column"), @@ -196,19 +209,13 @@ setMethod("ascii", column(jc) }) -#' asin -#' -#' Computes the sine inverse of the given value; the returned angle is in the range -#' -pi/2 through pi/2. -#' -#' @param x Column to compute on. +#' @details +#' \code{asin}: Computes the sine inverse of the given value; the returned angle is in +#' the range -pi/2 through pi/2. 
#' -#' @rdname asin -#' @name asin -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases asin,Column-method -#' @examples \dontrun{asin(df$c)} +#' @aliases asin asin,Column-method #' @note asin since 1.5.0 setMethod("asin", signature(x = "Column"), @@ -217,18 +224,12 @@ setMethod("asin", column(jc) }) -#' atan -#' -#' Computes the tangent inverse of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{atan}: Computes the tangent inverse of the given value. #' -#' @rdname atan -#' @name atan -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases atan,Column-method -#' @examples \dontrun{atan(df$c)} +#' @aliases atan atan,Column-method #' @note atan since 1.5.0 setMethod("atan", signature(x = "Column"), @@ -276,19 +277,13 @@ setMethod("base64", column(jc) }) -#' bin -#' -#' An expression that returns the string representation of the binary value of the given long -#' column. For example, bin("12") returns "1100". -#' -#' @param x Column to compute on. +#' @details +#' \code{bin}: An expression that returns the string representation of the binary value +#' of the given long column. For example, bin("12") returns "1100". #' -#' @rdname bin -#' @name bin -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases bin,Column-method -#' @examples \dontrun{bin(df$c)} +#' @aliases bin bin,Column-method #' @note bin since 1.5.0 setMethod("bin", signature(x = "Column"), @@ -317,18 +312,12 @@ setMethod("bitwiseNOT", column(jc) }) -#' cbrt -#' -#' Computes the cube-root of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{cbrt}: Computes the cube-root of the given value. #' -#' @rdname cbrt -#' @name cbrt -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases cbrt,Column-method -#' @examples \dontrun{cbrt(df$c)} +#' @aliases cbrt cbrt,Column-method #' @note cbrt since 1.4.0 setMethod("cbrt", signature(x = "Column"), @@ -337,18 +326,12 @@ setMethod("cbrt", column(jc) }) -#' Computes the ceiling of the given value -#' -#' Computes the ceiling of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{ceil}: Computes the ceiling of the given value. #' -#' @rdname ceil -#' @name ceil -#' @family math functions +#' @rdname column_math_functions #' @export -#' @aliases ceil,Column-method -#' @examples \dontrun{ceil(df$c)} +#' @aliases ceil ceil,Column-method #' @note ceil since 1.5.0 setMethod("ceil", signature(x = "Column"), @@ -357,6 +340,19 @@ setMethod("ceil", column(jc) }) +#' @details +#' \code{ceiling}: Alias for \code{ceil}. +#' +#' @rdname column_math_functions +#' @aliases ceiling ceiling,Column-method +#' @export +#' @note ceiling since 1.5.0 +setMethod("ceiling", + signature(x = "Column"), + function(x) { + ceil(x) + }) + #' Returns the first column that is not NA #' #' Returns the first column that is not NA, or NA if all inputs are. @@ -405,6 +401,7 @@ setMethod("column", function(x) { col(x) }) + #' corr #' #' Computes the Pearson Correlation Coefficient for two Columns. @@ -493,18 +490,12 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr column(jc) }) -#' cos -#' -#' Computes the cosine of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{cos}: Computes the cosine of the given value. 
#' -#' @rdname cos -#' @name cos -#' @family math functions -#' @aliases cos,Column-method +#' @rdname column_math_functions +#' @aliases cos cos,Column-method #' @export -#' @examples \dontrun{cos(df$c)} #' @note cos since 1.5.0 setMethod("cos", signature(x = "Column"), @@ -513,18 +504,12 @@ setMethod("cos", column(jc) }) -#' cosh -#' -#' Computes the hyperbolic cosine of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{cosh}: Computes the hyperbolic cosine of the given value. #' -#' @rdname cosh -#' @name cosh -#' @family math functions -#' @aliases cosh,Column-method +#' @rdname column_math_functions +#' @aliases cosh cosh,Column-method #' @export -#' @examples \dontrun{cosh(df$c)} #' @note cosh since 1.5.0 setMethod("cosh", signature(x = "Column"), @@ -679,18 +664,12 @@ setMethod("encode", column(jc) }) -#' exp -#' -#' Computes the exponential of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{exp}: Computes the exponential of the given value. #' -#' @rdname exp -#' @name exp -#' @family math functions -#' @aliases exp,Column-method +#' @rdname column_math_functions +#' @aliases exp exp,Column-method #' @export -#' @examples \dontrun{exp(df$c)} #' @note exp since 1.5.0 setMethod("exp", signature(x = "Column"), @@ -699,18 +678,12 @@ setMethod("exp", column(jc) }) -#' expm1 -#' -#' Computes the exponential of the given value minus one. -#' -#' @param x Column to compute on. +#' @details +#' \code{expm1}: Computes the exponential of the given value minus one. #' -#' @rdname expm1 -#' @name expm1 -#' @aliases expm1,Column-method -#' @family math functions +#' @rdname column_math_functions +#' @aliases expm1 expm1,Column-method #' @export -#' @examples \dontrun{expm1(df$c)} #' @note expm1 since 1.5.0 setMethod("expm1", signature(x = "Column"), @@ -719,18 +692,12 @@ setMethod("expm1", column(jc) }) -#' factorial -#' -#' Computes the factorial of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{factorial}: Computes the factorial of the given value. #' -#' @rdname factorial -#' @name factorial -#' @aliases factorial,Column-method -#' @family math functions +#' @rdname column_math_functions +#' @aliases factorial factorial,Column-method #' @export -#' @examples \dontrun{factorial(df$c)} #' @note factorial since 1.5.0 setMethod("factorial", signature(x = "Column"), @@ -772,18 +739,12 @@ setMethod("first", column(jc) }) -#' floor -#' -#' Computes the floor of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{floor}: Computes the floor of the given value. #' -#' @rdname floor -#' @name floor -#' @aliases floor,Column-method -#' @family math functions +#' @rdname column_math_functions +#' @aliases floor floor,Column-method #' @export -#' @examples \dontrun{floor(df$c)} #' @note floor since 1.5.0 setMethod("floor", signature(x = "Column"), @@ -792,18 +753,12 @@ setMethod("floor", column(jc) }) -#' hex -#' -#' Computes hex value of the given column. -#' -#' @param x Column to compute on. +#' @details +#' \code{hex}: Computes hex value of the given column. #' -#' @rdname hex -#' @name hex -#' @family math functions -#' @aliases hex,Column-method +#' @rdname column_math_functions +#' @aliases hex hex,Column-method #' @export -#' @examples \dontrun{hex(df$c)} #' @note hex since 1.5.0 setMethod("hex", signature(x = "Column"), @@ -983,18 +938,12 @@ setMethod("length", column(jc) }) -#' log -#' -#' Computes the natural logarithm of the given value. -#' -#' @param x Column to compute on. 
+#' @details +#' \code{log}: Computes the natural logarithm of the given value. #' -#' @rdname log -#' @name log -#' @aliases log,Column-method -#' @family math functions +#' @rdname column_math_functions +#' @aliases log log,Column-method #' @export -#' @examples \dontrun{log(df$c)} #' @note log since 1.5.0 setMethod("log", signature(x = "Column"), @@ -1003,18 +952,12 @@ setMethod("log", column(jc) }) -#' log10 -#' -#' Computes the logarithm of the given value in base 10. -#' -#' @param x Column to compute on. +#' @details +#' \code{log10}: Computes the logarithm of the given value in base 10. #' -#' @rdname log10 -#' @name log10 -#' @family math functions -#' @aliases log10,Column-method +#' @rdname column_math_functions +#' @aliases log10 log10,Column-method #' @export -#' @examples \dontrun{log10(df$c)} #' @note log10 since 1.5.0 setMethod("log10", signature(x = "Column"), @@ -1023,18 +966,12 @@ setMethod("log10", column(jc) }) -#' log1p -#' -#' Computes the natural logarithm of the given value plus one. -#' -#' @param x Column to compute on. +#' @details +#' \code{log1p}: Computes the natural logarithm of the given value plus one. #' -#' @rdname log1p -#' @name log1p -#' @family math functions -#' @aliases log1p,Column-method +#' @rdname column_math_functions +#' @aliases log1p log1p,Column-method #' @export -#' @examples \dontrun{log1p(df$c)} #' @note log1p since 1.5.0 setMethod("log1p", signature(x = "Column"), @@ -1043,18 +980,12 @@ setMethod("log1p", column(jc) }) -#' log2 -#' -#' Computes the logarithm of the given column in base 2. -#' -#' @param x Column to compute on. +#' @details +#' \code{log2}: Computes the logarithm of the given column in base 2. #' -#' @rdname log2 -#' @name log2 -#' @family math functions -#' @aliases log2,Column-method +#' @rdname column_math_functions +#' @aliases log2 log2,Column-method #' @export -#' @examples \dontrun{log2(df$c)} #' @note log2 since 1.5.0 setMethod("log2", signature(x = "Column"), @@ -1287,19 +1218,13 @@ setMethod("reverse", column(jc) }) -#' rint -#' -#' Returns the double value that is closest in value to the argument and +#' @details +#' \code{rint}: Returns the double value that is closest in value to the argument and #' is equal to a mathematical integer. #' -#' @param x Column to compute on. -#' -#' @rdname rint -#' @name rint -#' @family math functions -#' @aliases rint,Column-method +#' @rdname column_math_functions +#' @aliases rint rint,Column-method #' @export -#' @examples \dontrun{rint(df$c)} #' @note rint since 1.5.0 setMethod("rint", signature(x = "Column"), @@ -1308,18 +1233,13 @@ setMethod("rint", column(jc) }) -#' round -#' -#' Returns the value of the column \code{e} rounded to 0 decimal places using HALF_UP rounding mode. -#' -#' @param x Column to compute on. +#' @details +#' \code{round}: Returns the value of the column rounded to 0 decimal places +#' using HALF_UP rounding mode. #' -#' @rdname round -#' @name round -#' @family math functions -#' @aliases round,Column-method +#' @rdname column_math_functions +#' @aliases round round,Column-method #' @export -#' @examples \dontrun{round(df$c)} #' @note round since 1.5.0 setMethod("round", signature(x = "Column"), @@ -1328,24 +1248,18 @@ setMethod("round", column(jc) }) -#' bround -#' -#' Returns the value of the column \code{e} rounded to \code{scale} decimal places using HALF_EVEN rounding -#' mode if \code{scale} >= 0 or at integer part when \code{scale} < 0. 
+#' @details +#' \code{bround}: Returns the value of the column \code{e} rounded to \code{scale} decimal places +#' using HALF_EVEN rounding mode if \code{scale} >= 0 or at integer part when \code{scale} < 0. #' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number. #' bround(2.5, 0) = 2, bround(3.5, 0) = 4. #' -#' @param x Column to compute on. #' @param scale round to \code{scale} digits to the right of the decimal point when \code{scale} > 0, #' the nearest even number when \code{scale} = 0, and \code{scale} digits to the left #' of the decimal point when \code{scale} < 0. -#' @param ... further arguments to be passed to or from other methods. -#' @rdname bround -#' @name bround -#' @family math functions -#' @aliases bround,Column-method +#' @rdname column_math_functions +#' @aliases bround bround,Column-method #' @export -#' @examples \dontrun{bround(df$c, 0)} #' @note bround since 2.0.0 setMethod("bround", signature(x = "Column"), @@ -1354,7 +1268,6 @@ setMethod("bround", column(jc) }) - #' rtrim #' #' Trim the spaces from right end for the specified string value. @@ -1375,7 +1288,6 @@ setMethod("rtrim", column(jc) }) - #' @details #' \code{sd}: Alias for \code{stddev_samp}. #' @@ -1429,18 +1341,12 @@ setMethod("sha1", column(jc) }) -#' signum -#' -#' Computes the signum of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{signum}: Computes the signum of the given value. #' -#' @rdname sign -#' @name signum -#' @aliases signum,Column-method -#' @family math functions +#' @rdname column_math_functions +#' @aliases signum signum,Column-method #' @export -#' @examples \dontrun{signum(df$c)} #' @note signum since 1.5.0 setMethod("signum", signature(x = "Column"), @@ -1449,18 +1355,24 @@ setMethod("signum", column(jc) }) -#' sin -#' -#' Computes the sine of the given value. +#' @details +#' \code{sign}: Alias for \code{signum}. #' -#' @param x Column to compute on. +#' @rdname column_math_functions +#' @aliases sign sign,Column-method +#' @export +#' @note sign since 1.5.0 +setMethod("sign", signature(x = "Column"), + function(x) { + signum(x) + }) + +#' @details +#' \code{sin}: Computes the sine of the given value. #' -#' @rdname sin -#' @name sin -#' @family math functions -#' @aliases sin,Column-method +#' @rdname column_math_functions +#' @aliases sin sin,Column-method #' @export -#' @examples \dontrun{sin(df$c)} #' @note sin since 1.5.0 setMethod("sin", signature(x = "Column"), @@ -1469,18 +1381,12 @@ setMethod("sin", column(jc) }) -#' sinh -#' -#' Computes the hyperbolic sine of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{sinh}: Computes the hyperbolic sine of the given value. #' -#' @rdname sinh -#' @name sinh -#' @family math functions -#' @aliases sinh,Column-method +#' @rdname column_math_functions +#' @aliases sinh sinh,Column-method #' @export -#' @examples \dontrun{sinh(df$c)} #' @note sinh since 1.5.0 setMethod("sinh", signature(x = "Column"), @@ -1616,18 +1522,12 @@ setMethod("struct", column(jc) }) -#' sqrt -#' -#' Computes the square root of the specified float value. -#' -#' @param x Column to compute on. +#' @details +#' \code{sqrt}: Computes the square root of the specified float value. 
#' -#' @rdname sqrt -#' @name sqrt -#' @family math functions -#' @aliases sqrt,Column-method +#' @rdname column_math_functions +#' @aliases sqrt sqrt,Column-method #' @export -#' @examples \dontrun{sqrt(df$c)} #' @note sqrt since 1.5.0 setMethod("sqrt", signature(x = "Column"), @@ -1669,18 +1569,12 @@ setMethod("sumDistinct", column(jc) }) -#' tan -#' -#' Computes the tangent of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{tan}: Computes the tangent of the given value. #' -#' @rdname tan -#' @name tan -#' @family math functions -#' @aliases tan,Column-method +#' @rdname column_math_functions +#' @aliases tan tan,Column-method #' @export -#' @examples \dontrun{tan(df$c)} #' @note tan since 1.5.0 setMethod("tan", signature(x = "Column"), @@ -1689,18 +1583,12 @@ setMethod("tan", column(jc) }) -#' tanh -#' -#' Computes the hyperbolic tangent of the given value. -#' -#' @param x Column to compute on. +#' @details +#' \code{tanh}: Computes the hyperbolic tangent of the given value. #' -#' @rdname tanh -#' @name tanh -#' @family math functions -#' @aliases tanh,Column-method +#' @rdname column_math_functions +#' @aliases tanh tanh,Column-method #' @export -#' @examples \dontrun{tanh(df$c)} #' @note tanh since 1.5.0 setMethod("tanh", signature(x = "Column"), @@ -1709,18 +1597,13 @@ setMethod("tanh", column(jc) }) -#' toDegrees -#' -#' Converts an angle measured in radians to an approximately equivalent angle measured in degrees. -#' -#' @param x Column to compute on. +#' @details +#' \code{toDegrees}: Converts an angle measured in radians to an approximately equivalent angle +#' measured in degrees. #' -#' @rdname toDegrees -#' @name toDegrees -#' @family math functions -#' @aliases toDegrees,Column-method +#' @rdname column_math_functions +#' @aliases toDegrees toDegrees,Column-method #' @export -#' @examples \dontrun{toDegrees(df$c)} #' @note toDegrees since 1.4.0 setMethod("toDegrees", signature(x = "Column"), @@ -1729,18 +1612,13 @@ setMethod("toDegrees", column(jc) }) -#' toRadians -#' -#' Converts an angle measured in degrees to an approximately equivalent angle measured in radians. -#' -#' @param x Column to compute on. +#' @details +#' \code{toRadians}: Converts an angle measured in degrees to an approximately equivalent angle +#' measured in radians. #' -#' @rdname toRadians -#' @name toRadians -#' @family math functions -#' @aliases toRadians,Column-method +#' @rdname column_math_functions +#' @aliases toRadians toRadians,Column-method #' @export -#' @examples \dontrun{toRadians(df$c)} #' @note toRadians since 1.4.0 setMethod("toRadians", signature(x = "Column"), @@ -1894,19 +1772,13 @@ setMethod("unbase64", column(jc) }) -#' unhex -#' -#' Inverse of hex. Interprets each pair of characters as a hexadecimal number +#' @details +#' \code{unhex}: Inverse of hex. Interprets each pair of characters as a hexadecimal number #' and converts to the byte representation of number. #' -#' @param x Column to compute on. -#' -#' @rdname unhex -#' @name unhex -#' @family math functions -#' @aliases unhex,Column-method +#' @rdname column_math_functions +#' @aliases unhex unhex,Column-method #' @export -#' @examples \dontrun{unhex(df$c)} #' @note unhex since 1.5.0 setMethod("unhex", signature(x = "Column"), @@ -2020,20 +1892,13 @@ setMethod("year", column(jc) }) -#' atan2 -#' -#' Returns the angle theta from the conversion of rectangular coordinates (x, y) to -#' polar coordinates (r, theta). -# -#' @param x Column to compute on. -#' @param y Column to compute on. 
+#' @details +#' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates +#' (x, y) to polar coordinates (r, theta). #' -#' @rdname atan2 -#' @name atan2 -#' @family math functions -#' @aliases atan2,Column-method +#' @rdname column_math_functions +#' @aliases atan2 atan2,Column-method #' @export -#' @examples \dontrun{atan2(df$c, x)} #' @note atan2 since 1.5.0 setMethod("atan2", signature(y = "Column"), function(y, x) { @@ -2068,19 +1933,12 @@ setMethod("datediff", signature(y = "Column"), column(jc) }) -#' hypot -#' -#' Computes "sqrt(a^2 + b^2)" without intermediate overflow or underflow. -# -#' @param x Column to compute on. -#' @param y Column to compute on. +#' @details +#' \code{hypot}: Computes "sqrt(a^2 + b^2)" without intermediate overflow or underflow. #' -#' @rdname hypot -#' @name hypot -#' @family math functions -#' @aliases hypot,Column-method +#' @rdname column_math_functions +#' @aliases hypot hypot,Column-method #' @export -#' @examples \dontrun{hypot(df$c, x)} #' @note hypot since 1.4.0 setMethod("hypot", signature(y = "Column"), function(y, x) { @@ -2154,20 +2012,13 @@ setMethod("nanvl", signature(y = "Column"), column(jc) }) -#' pmod -#' -#' Returns the positive value of dividend mod divisor. -#' -#' @param x divisor Column. -#' @param y dividend Column. +#' @details +#' \code{pmod}: Returns the positive value of dividend mod divisor. +#' Column \code{x} is divisor column, and column \code{y} is the dividend column. #' -#' @rdname pmod -#' @name pmod -#' @docType methods -#' @family math functions -#' @aliases pmod,Column-method +#' @rdname column_math_functions +#' @aliases pmod pmod,Column-method #' @export -#' @examples \dontrun{pmod(df$c, x)} #' @note pmod since 1.5.0 setMethod("pmod", signature(y = "Column"), function(y, x) { @@ -2290,31 +2141,6 @@ setMethod("least", column(jc) }) -#' @rdname ceil -#' -#' @name ceiling -#' @aliases ceiling,Column-method -#' @export -#' @examples \dontrun{ceiling(df$c)} -#' @note ceiling since 1.5.0 -setMethod("ceiling", - signature(x = "Column"), - function(x) { - ceil(x) - }) - -#' @rdname sign -#' -#' @name sign -#' @aliases sign,Column-method -#' @export -#' @examples \dontrun{sign(df$c)} -#' @note sign since 1.5.0 -setMethod("sign", signature(x = "Column"), - function(x) { - signum(x) - }) - #' @details #' \code{n_distinct}: Returns the number of distinct items in a group. #' @@ -2564,20 +2390,13 @@ setMethod("sha2", signature(y = "Column", x = "numeric"), column(jc) }) -#' shiftLeft -#' -#' Shift the given value numBits left. If the given value is a long value, this function -#' will return a long value else it will return an integer value. -#' -#' @param y column to compute on. -#' @param x number of bits to shift. +#' @details +#' \code{shiftLeft}: Shifts the given value numBits left. If the given value is a long value, +#' this function will return a long value else it will return an integer value. #' -#' @family math functions -#' @rdname shiftLeft -#' @name shiftLeft -#' @aliases shiftLeft,Column,numeric-method +#' @rdname column_math_functions +#' @aliases shiftLeft shiftLeft,Column,numeric-method #' @export -#' @examples \dontrun{shiftLeft(df$c, 1)} #' @note shiftLeft since 1.5.0 setMethod("shiftLeft", signature(y = "Column", x = "numeric"), function(y, x) { @@ -2587,20 +2406,13 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), column(jc) }) -#' shiftRight -#' -#' (Signed) shift the given value numBits right. 
If the given value is a long value, it will return -#' a long value else it will return an integer value. -#' -#' @param y column to compute on. -#' @param x number of bits to shift. +#' @details +#' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long value, +#' it will return a long value else it will return an integer value. #' -#' @family math functions -#' @rdname shiftRight -#' @name shiftRight -#' @aliases shiftRight,Column,numeric-method +#' @rdname column_math_functions +#' @aliases shiftRight shiftRight,Column,numeric-method #' @export -#' @examples \dontrun{shiftRight(df$c, 1)} #' @note shiftRight since 1.5.0 setMethod("shiftRight", signature(y = "Column", x = "numeric"), function(y, x) { @@ -2610,20 +2422,13 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), column(jc) }) -#' shiftRightUnsigned -#' -#' Unsigned shift the given value numBits right. If the given value is a long value, +#' @details +#' \code{shiftRight}: (Unigned) shifts the given value numBits right. If the given value is a long value, #' it will return a long value else it will return an integer value. #' -#' @param y column to compute on. -#' @param x number of bits to shift. -#' -#' @family math functions -#' @rdname shiftRightUnsigned -#' @name shiftRightUnsigned -#' @aliases shiftRightUnsigned,Column,numeric-method +#' @rdname column_math_functions +#' @aliases shiftRightUnsigned shiftRightUnsigned,Column,numeric-method #' @export -#' @examples \dontrun{shiftRightUnsigned(df$c, 1)} #' @note shiftRightUnsigned since 1.5.0 setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), function(y, x) { @@ -2656,20 +2461,14 @@ setMethod("concat_ws", signature(sep = "character", x = "Column"), column(jc) }) -#' conv -#' -#' Convert a number in a string column from one base to another. +#' @details +#' \code{conv}: Converts a number in a string column from one base to another. #' -#' @param x column to convert. #' @param fromBase base to convert from. #' @param toBase base to convert to. -#' -#' @family math functions -#' @rdname conv -#' @aliases conv,Column,numeric,numeric-method -#' @name conv +#' @rdname column_math_functions +#' @aliases conv conv,Column,numeric,numeric-method #' @export -#' @examples \dontrun{conv(df$n, 2, 16)} #' @note conv since 1.5.0 setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeric"), function(x, fromBase, toBase) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f105174cea70d..0248ec585d771 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -931,24 +931,28 @@ setGeneric("avg", function(x, ...) { standardGeneric("avg") }) #' @export setGeneric("base64", function(x) { standardGeneric("base64") }) -#' @rdname bin +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("bin", function(x) { standardGeneric("bin") }) #' @rdname bitwiseNOT #' @export setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) -#' @rdname bround +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("bround", function(x, ...) { standardGeneric("bround") }) -#' @rdname cbrt +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) -#' @rdname ceil +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("ceil", function(x) { standardGeneric("ceil") }) #' @rdname column_aggregate_functions @@ -973,8 +977,9 @@ setGeneric("concat", function(x, ...) 
{ standardGeneric("concat") }) #' @export setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") }) -#' @rdname conv +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") }) #' @rdname column_aggregate_functions @@ -1094,8 +1099,9 @@ setGeneric("grouping_bit", function(x) { standardGeneric("grouping_bit") }) #' @name NULL setGeneric("grouping_id", function(x, ...) { standardGeneric("grouping_id") }) -#' @rdname hex +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("hex", function(x) { standardGeneric("hex") }) #' @rdname column_datetime_functions @@ -1103,8 +1109,9 @@ setGeneric("hex", function(x) { standardGeneric("hex") }) #' @name NULL setGeneric("hour", function(x) { standardGeneric("hour") }) -#' @rdname hypot +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) #' @rdname initcap @@ -1235,8 +1242,9 @@ setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) #' @export setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") }) -#' @rdname pmod +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("pmod", function(y, x) { standardGeneric("pmod") }) #' @rdname posexplode @@ -1281,8 +1289,9 @@ setGeneric("repeat_string", function(x, n) { standardGeneric("repeat_string") }) #' @export setGeneric("reverse", function(x) { standardGeneric("reverse") }) -#' @rdname rint +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("rint", function(x) { standardGeneric("rint") }) #' @param x empty. Should be used with no argument. @@ -1316,20 +1325,24 @@ setGeneric("sha1", function(x) { standardGeneric("sha1") }) #' @export setGeneric("sha2", function(y, x) { standardGeneric("sha2") }) -#' @rdname shiftLeft +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("shiftLeft", function(y, x) { standardGeneric("shiftLeft") }) -#' @rdname shiftRight +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("shiftRight", function(y, x) { standardGeneric("shiftRight") }) -#' @rdname shiftRightUnsigned +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUnsigned") }) -#' @rdname sign +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("signum", function(x) { standardGeneric("signum") }) #' @rdname size @@ -1386,12 +1399,14 @@ setGeneric("substring_index", function(x, delim, count) { standardGeneric("subst #' @name NULL setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") }) -#' @rdname toDegrees +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") }) -#' @rdname toRadians +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) #' @rdname column_datetime_functions @@ -1425,8 +1440,9 @@ setGeneric("trim", function(x) { standardGeneric("trim") }) #' @export setGeneric("unbase64", function(x) { standardGeneric("unbase64") }) -#' @rdname unhex +#' @rdname column_math_functions #' @export +#' @name NULL setGeneric("unhex", function(x) { standardGeneric("unhex") }) #' @rdname column_datetime_functions From 838effb98a0d3410766771533402ce0386133af3 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 28 Jun 2017 14:28:40 +0800 Subject: [PATCH 
029/779] Revert "[SPARK-13534][PYSPARK] Using Apache Arrow to increase performance of DataFrame.toPandas" This reverts commit e44697606f429b01808c1a22cb44cb5b89585c5c. --- bin/pyspark | 2 +- dev/deps/spark-deps-hadoop-2.6 | 5 - dev/deps/spark-deps-hadoop-2.7 | 5 - dev/run-pip-tests | 6 - pom.xml | 20 - python/pyspark/serializers.py | 17 - python/pyspark/sql/dataframe.py | 48 +- python/pyspark/sql/tests.py | 79 +- .../apache/spark/sql/internal/SQLConf.scala | 22 - sql/core/pom.xml | 4 - .../scala/org/apache/spark/sql/Dataset.scala | 20 - .../sql/execution/arrow/ArrowConverters.scala | 429 ------ .../arrow/ArrowConvertersSuite.scala | 1222 ----------------- 13 files changed, 13 insertions(+), 1866 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala diff --git a/bin/pyspark b/bin/pyspark index 8eeea7716cc98..98387c2ec5b8a 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -68,7 +68,7 @@ if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m "$@" + exec "$PYSPARK_DRIVER_PYTHON" -m "$1" exit fi diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 9868c1ab7c2ab..9287bd47cf113 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -13,9 +13,6 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar -arrow-format-0.4.0.jar -arrow-memory-0.4.0.jar -arrow-vector-0.4.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -58,7 +55,6 @@ datanucleus-core-3.2.10.jar datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar -flatbuffers-1.2.0-3f79e055.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -81,7 +77,6 @@ hadoop-yarn-server-web-proxy-2.6.5.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -hppc-0.7.1.jar htrace-core-3.0.4.jar httpclient-4.5.2.jar httpcore-4.4.4.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 57c78cfe12087..9127413ab6c23 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -13,9 +13,6 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar -arrow-format-0.4.0.jar -arrow-memory-0.4.0.jar -arrow-vector-0.4.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -58,7 +55,6 @@ datanucleus-core-3.2.10.jar datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar -flatbuffers-1.2.0-3f79e055.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -81,7 +77,6 @@ hadoop-yarn-server-web-proxy-2.7.3.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -hppc-0.7.1.jar htrace-core-3.1.0-incubating.jar httpclient-4.5.2.jar httpcore-4.4.4.jar diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 225e9209536f0..d51dde12a03c5 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -83,8 +83,6 @@ for python in "${PYTHON_EXECS[@]}"; do if [ -n "$USE_CONDA" ]; then conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools source activate "$VIRTUALENV_PATH" - conda install -y -c conda-forge pyarrow=0.4.0 - TEST_PYARROW=1 else mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" @@ -122,10 +120,6 @@ for python in "${PYTHON_EXECS[@]}"; 
do python "$FWDIR"/dev/pip-sanity-check.py echo "Run the tests for context.py" python "$FWDIR"/python/pyspark/context.py - if [ -n "$TEST_PYARROW" ]; then - echo "Run tests for pyarrow" - SPARK_TESTING=1 "$FWDIR"/bin/pyspark pyspark.sql.tests ArrowTests - fi cd "$FWDIR" diff --git a/pom.xml b/pom.xml index f124ba45007b7..5f524079495c0 100644 --- a/pom.xml +++ b/pom.xml @@ -181,7 +181,6 @@ 2.6 1.8 1.0.0 - 0.4.0 ${java.home} @@ -1879,25 +1878,6 @@ paranamer ${paranamer.version} - - org.apache.arrow - arrow-vector - ${arrow.version} - - - com.fasterxml.jackson.core - jackson-annotations - - - com.fasterxml.jackson.core - jackson-databind - - - io.netty - netty-handler - - - diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index d5c2a7518b18f..ea5e00e9eeef5 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -182,23 +182,6 @@ def loads(self, obj): raise NotImplementedError -class ArrowSerializer(FramedSerializer): - """ - Serializes an Arrow stream. - """ - - def dumps(self, obj): - raise NotImplementedError - - def loads(self, obj): - import pyarrow as pa - reader = pa.RecordBatchFileReader(pa.BufferReader(obj)) - return reader.read_all() - - def __repr__(self): - return "ArrowSerializer" - - class BatchedSerializer(Serializer): """ diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 760f113dfd197..0649271ed2246 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -29,8 +29,7 @@ from pyspark import copy_func, since from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix -from pyspark.serializers import ArrowSerializer, BatchedSerializer, PickleSerializer, \ - UTF8Deserializer +from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer from pyspark.storagelevel import StorageLevel from pyspark.traceback_utils import SCCallSiteSync from pyspark.sql.types import _parse_datatype_json_string @@ -1709,8 +1708,7 @@ def toDF(self, *cols): @since(1.3) def toPandas(self): - """ - Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. + """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. This is only available if Pandas is installed and available. 
@@ -1723,42 +1721,18 @@ def toPandas(self): 1 5 Bob """ import pandas as pd - if self.sql_ctx.getConf("spark.sql.execution.arrow.enable", "false").lower() == "true": - try: - import pyarrow - tables = self._collectAsArrow() - if tables: - table = pyarrow.concat_tables(tables) - return table.to_pandas() - else: - return pd.DataFrame.from_records([], columns=self.columns) - except ImportError as e: - msg = "note: pyarrow must be installed and available on calling Python process " \ - "if using spark.sql.execution.arrow.enable=true" - raise ImportError("%s\n%s" % (e.message, msg)) - else: - dtype = {} - for field in self.schema: - pandas_type = _to_corrected_pandas_type(field.dataType) - if pandas_type is not None: - dtype[field.name] = pandas_type - pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) + dtype = {} + for field in self.schema: + pandas_type = _to_corrected_pandas_type(field.dataType) + if pandas_type is not None: + dtype[field.name] = pandas_type - for f, t in dtype.items(): - pdf[f] = pdf[f].astype(t, copy=False) - return pdf + pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) - def _collectAsArrow(self): - """ - Returns all records as list of deserialized ArrowPayloads, pyarrow must be installed - and available. - - .. note:: Experimental. - """ - with SCCallSiteSync(self._sc) as css: - port = self._jdf.collectAsArrowToPython() - return list(_load_from_socket(port, ArrowSerializer())) + for f, t in dtype.items(): + pdf[f] = pdf[f].astype(t, copy=False) + return pdf ########################################################################################## # Pandas compatibility diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 326e8548a617c..0a1cd6856b8e8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -58,21 +58,12 @@ from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row from pyspark.sql.types import * from pyspark.sql.types import UserDefinedType, _infer_type -from pyspark.tests import QuietTest, ReusedPySparkTestCase, SparkSubmitTests +from pyspark.tests import ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException -_have_arrow = False -try: - import pyarrow - _have_arrow = True -except: - # No Arrow, but that's okay, we'll skip those tests - pass - - class UTCOffsetTimezone(datetime.tzinfo): """ Specifies timezone in UTC offset @@ -2629,74 +2620,6 @@ def range_frame_match(): importlib.reload(window) - -@unittest.skipIf(not _have_arrow, "Arrow not installed") -class ArrowTests(ReusedPySparkTestCase): - - @classmethod - def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() - cls.spark = SparkSession(cls.sc) - cls.spark.conf.set("spark.sql.execution.arrow.enable", "true") - cls.schema = StructType([ - StructField("1_str_t", StringType(), True), - StructField("2_int_t", IntegerType(), True), - StructField("3_long_t", LongType(), True), - StructField("4_float_t", FloatType(), True), - StructField("5_double_t", DoubleType(), True)]) - cls.data = [("a", 1, 10, 0.2, 2.0), - ("b", 2, 20, 0.4, 4.0), - ("c", 3, 30, 0.8, 6.0)] - - def assertFramesEqual(self, df_with_arrow, df_without): - msg = ("DataFrame from Arrow is not equal" + - ("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow, df_with_arrow.dtypes)) + - ("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes))) - 
self.assertTrue(df_without.equals(df_with_arrow), msg=msg) - - def test_unsupported_datatype(self): - schema = StructType([StructField("array", ArrayType(IntegerType(), False), True)]) - df = self.spark.createDataFrame([([1, 2, 3],)], schema=schema) - with QuietTest(self.sc): - self.assertRaises(Exception, lambda: df.toPandas()) - - def test_null_conversion(self): - df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] + - self.data) - pdf = df_null.toPandas() - null_counts = pdf.isnull().sum().tolist() - self.assertTrue(all([c == 1 for c in null_counts])) - - def test_toPandas_arrow_toggle(self): - df = self.spark.createDataFrame(self.data, schema=self.schema) - self.spark.conf.set("spark.sql.execution.arrow.enable", "false") - pdf = df.toPandas() - self.spark.conf.set("spark.sql.execution.arrow.enable", "true") - pdf_arrow = df.toPandas() - self.assertFramesEqual(pdf_arrow, pdf) - - def test_pandas_round_trip(self): - import pandas as pd - import numpy as np - data_dict = {} - for j, name in enumerate(self.schema.names): - data_dict[name] = [self.data[i][j] for i in range(len(self.data))] - # need to convert these to numpy types first - data_dict["2_int_t"] = np.int32(data_dict["2_int_t"]) - data_dict["4_float_t"] = np.float32(data_dict["4_float_t"]) - pdf = pd.DataFrame(data=data_dict) - df = self.spark.createDataFrame(self.data, schema=self.schema) - pdf_arrow = df.toPandas() - self.assertFramesEqual(pdf_arrow, pdf) - - def test_filtered_frame(self): - df = self.spark.range(3).toDF("i") - pdf = df.filter("i < 0").toPandas() - self.assertEqual(len(pdf.columns), 1) - self.assertEqual(pdf.columns[0], "i") - self.assertTrue(pdf.empty) - - if __name__ == "__main__": from pyspark.sql.tests import * if xmlrunner: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 9c8e26a8eeadf..c641e4d3a23e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -847,24 +847,6 @@ object SQLConf { .intConf .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt) - val ARROW_EXECUTION_ENABLE = - buildConf("spark.sql.execution.arrow.enable") - .internal() - .doc("Make use of Apache Arrow for columnar data transfers. Currently available " + - "for use with pyspark.sql.DataFrame.toPandas with the following data types: " + - "StringType, BinaryType, BooleanType, DoubleType, FloatType, ByteType, IntegerType, " + - "LongType, ShortType") - .booleanConf - .createWithDefault(false) - - val ARROW_EXECUTION_MAX_RECORDS_PER_BATCH = - buildConf("spark.sql.execution.arrow.maxRecordsPerBatch") - .internal() - .doc("When using Apache Arrow, limit the maximum number of records that can be written " + - "to a single ArrowRecordBatch in memory. If set to zero or negative there is no limit.") - .intConf - .createWithDefault(10000) - object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1123,10 +1105,6 @@ class SQLConf extends Serializable with Logging { def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) - def arrowEnable: Boolean = getConf(ARROW_EXECUTION_ENABLE) - - def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH) - /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. 
*/ diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 661c31ded7148..1bc34a6b069d9 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -103,10 +103,6 @@ jackson-databind ${fasterxml.jackson.version} - - org.apache.arrow - arrow-vector - org.apache.xbean xbean-asm5-shaded diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 268a37ff5d271..7be4aa1ca9562 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -47,7 +47,6 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} import org.apache.spark.sql.catalyst.util.{usePrettyExpression, DateTimeUtils} import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.arrow.{ArrowConverters, ArrowPayload} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.python.EvaluatePython @@ -2887,16 +2886,6 @@ class Dataset[T] private[sql]( } } - /** - * Collect a Dataset as ArrowPayload byte arrays and serve to PySpark. - */ - private[sql] def collectAsArrowToPython(): Int = { - withNewExecutionId { - val iter = toArrowPayload.collect().iterator.map(_.asPythonSerializable) - PythonRDD.serveIterator(iter, "serve-Arrow") - } - } - private[sql] def toPythonIterator(): Int = { withNewExecutionId { PythonRDD.toLocalIteratorAndServe(javaToPython.rdd) @@ -2978,13 +2967,4 @@ class Dataset[T] private[sql]( Dataset(sparkSession, logicalPlan) } } - - /** Convert to an RDD of ArrowPayload byte arrays */ - private[sql] def toArrowPayload: RDD[ArrowPayload] = { - val schemaCaptured = this.schema - val maxRecordsPerBatch = sparkSession.sessionState.conf.arrowMaxRecordsPerBatch - queryExecution.toRdd.mapPartitionsInternal { iter => - ArrowConverters.toPayloadIterator(iter, schemaCaptured, maxRecordsPerBatch) - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala deleted file mode 100644 index 6af5c73422377..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ /dev/null @@ -1,429 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -package org.apache.spark.sql.execution.arrow - -import java.io.ByteArrayOutputStream -import java.nio.channels.Channels - -import scala.collection.JavaConverters._ - -import io.netty.buffer.ArrowBuf -import org.apache.arrow.memory.{BufferAllocator, RootAllocator} -import org.apache.arrow.vector._ -import org.apache.arrow.vector.BaseValueVector.BaseMutator -import org.apache.arrow.vector.file._ -import org.apache.arrow.vector.schema.{ArrowFieldNode, ArrowRecordBatch} -import org.apache.arrow.vector.types.FloatingPointPrecision -import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} -import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils - - -/** - * Store Arrow data in a form that can be serialized by Spark and served to a Python process. - */ -private[sql] class ArrowPayload private[arrow] (payload: Array[Byte]) extends Serializable { - - /** - * Convert the ArrowPayload to an ArrowRecordBatch. - */ - def loadBatch(allocator: BufferAllocator): ArrowRecordBatch = { - ArrowConverters.byteArrayToBatch(payload, allocator) - } - - /** - * Get the ArrowPayload as a type that can be served to Python. - */ - def asPythonSerializable: Array[Byte] = payload -} - -private[sql] object ArrowPayload { - - /** - * Create an ArrowPayload from an ArrowRecordBatch and Spark schema. - */ - def apply( - batch: ArrowRecordBatch, - schema: StructType, - allocator: BufferAllocator): ArrowPayload = { - new ArrowPayload(ArrowConverters.batchToByteArray(batch, schema, allocator)) - } -} - -private[sql] object ArrowConverters { - - /** - * Map a Spark DataType to ArrowType. - */ - private[arrow] def sparkTypeToArrowType(dataType: DataType): ArrowType = { - dataType match { - case BooleanType => ArrowType.Bool.INSTANCE - case ShortType => new ArrowType.Int(8 * ShortType.defaultSize, true) - case IntegerType => new ArrowType.Int(8 * IntegerType.defaultSize, true) - case LongType => new ArrowType.Int(8 * LongType.defaultSize, true) - case FloatType => new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) - case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE) - case ByteType => new ArrowType.Int(8, true) - case StringType => ArrowType.Utf8.INSTANCE - case BinaryType => ArrowType.Binary.INSTANCE - case _ => throw new UnsupportedOperationException(s"Unsupported data type: $dataType") - } - } - - /** - * Convert a Spark Dataset schema to Arrow schema. - */ - private[arrow] def schemaToArrowSchema(schema: StructType): Schema = { - val arrowFields = schema.fields.map { f => - new Field(f.name, f.nullable, sparkTypeToArrowType(f.dataType), List.empty[Field].asJava) - } - new Schema(arrowFields.toList.asJava) - } - - /** - * Maps Iterator from InternalRow to ArrowPayload. Limit ArrowRecordBatch size in ArrowPayload - * by setting maxRecordsPerBatch or use 0 to fully consume rowIter. 
- */ - private[sql] def toPayloadIterator( - rowIter: Iterator[InternalRow], - schema: StructType, - maxRecordsPerBatch: Int): Iterator[ArrowPayload] = { - new Iterator[ArrowPayload] { - private val _allocator = new RootAllocator(Long.MaxValue) - private var _nextPayload = if (rowIter.nonEmpty) convert() else null - - override def hasNext: Boolean = _nextPayload != null - - override def next(): ArrowPayload = { - val obj = _nextPayload - if (hasNext) { - if (rowIter.hasNext) { - _nextPayload = convert() - } else { - _allocator.close() - _nextPayload = null - } - } - obj - } - - private def convert(): ArrowPayload = { - val batch = internalRowIterToArrowBatch(rowIter, schema, _allocator, maxRecordsPerBatch) - ArrowPayload(batch, schema, _allocator) - } - } - } - - /** - * Iterate over InternalRows and write to an ArrowRecordBatch, stopping when rowIter is consumed - * or the number of records in the batch equals maxRecordsInBatch. If maxRecordsPerBatch is 0, - * then rowIter will be fully consumed. - */ - private def internalRowIterToArrowBatch( - rowIter: Iterator[InternalRow], - schema: StructType, - allocator: BufferAllocator, - maxRecordsPerBatch: Int = 0): ArrowRecordBatch = { - - val columnWriters = schema.fields.zipWithIndex.map { case (field, ordinal) => - ColumnWriter(field.dataType, ordinal, allocator).init() - } - - val writerLength = columnWriters.length - var recordsInBatch = 0 - while (rowIter.hasNext && (maxRecordsPerBatch <= 0 || recordsInBatch < maxRecordsPerBatch)) { - val row = rowIter.next() - var i = 0 - while (i < writerLength) { - columnWriters(i).write(row) - i += 1 - } - recordsInBatch += 1 - } - - val (fieldNodes, bufferArrays) = columnWriters.map(_.finish()).unzip - val buffers = bufferArrays.flatten - - val rowLength = if (fieldNodes.nonEmpty) fieldNodes.head.getLength else 0 - val recordBatch = new ArrowRecordBatch(rowLength, - fieldNodes.toList.asJava, buffers.toList.asJava) - - buffers.foreach(_.release()) - recordBatch - } - - /** - * Convert an ArrowRecordBatch to a byte array and close batch to release resources. Once closed, - * the batch can no longer be used. - */ - private[arrow] def batchToByteArray( - batch: ArrowRecordBatch, - schema: StructType, - allocator: BufferAllocator): Array[Byte] = { - val arrowSchema = ArrowConverters.schemaToArrowSchema(schema) - val root = VectorSchemaRoot.create(arrowSchema, allocator) - val out = new ByteArrayOutputStream() - val writer = new ArrowFileWriter(root, null, Channels.newChannel(out)) - - // Write a batch to byte stream, ensure the batch, allocator and writer are closed - Utils.tryWithSafeFinally { - val loader = new VectorLoader(root) - loader.load(batch) - writer.writeBatch() // writeBatch can throw IOException - } { - batch.close() - root.close() - writer.close() - } - out.toByteArray - } - - /** - * Convert a byte array to an ArrowRecordBatch. - */ - private[arrow] def byteArrayToBatch( - batchBytes: Array[Byte], - allocator: BufferAllocator): ArrowRecordBatch = { - val in = new ByteArrayReadableSeekableByteChannel(batchBytes) - val reader = new ArrowFileReader(in, allocator) - - // Read a batch from a byte stream, ensure the reader is closed - Utils.tryWithSafeFinally { - val root = reader.getVectorSchemaRoot // throws IOException - val unloader = new VectorUnloader(root) - reader.loadNextBatch() // throws IOException - unloader.getRecordBatch - } { - reader.close() - } - } -} - -/** - * Interface for writing InternalRows to Arrow Buffers. 
- */ -private[arrow] trait ColumnWriter { - def init(): this.type - def write(row: InternalRow): Unit - - /** - * Clear the column writer and return the ArrowFieldNode and ArrowBuf. - * This should be called only once after all the data is written. - */ - def finish(): (ArrowFieldNode, Array[ArrowBuf]) -} - -/** - * Base class for flat arrow column writer, i.e., column without children. - */ -private[arrow] abstract class PrimitiveColumnWriter(val ordinal: Int) - extends ColumnWriter { - - def getFieldType(dtype: ArrowType): FieldType = FieldType.nullable(dtype) - - def valueVector: BaseDataValueVector - def valueMutator: BaseMutator - - def setNull(): Unit - def setValue(row: InternalRow): Unit - - protected var count = 0 - protected var nullCount = 0 - - override def init(): this.type = { - valueVector.allocateNew() - this - } - - override def write(row: InternalRow): Unit = { - if (row.isNullAt(ordinal)) { - setNull() - nullCount += 1 - } else { - setValue(row) - } - count += 1 - } - - override def finish(): (ArrowFieldNode, Array[ArrowBuf]) = { - valueMutator.setValueCount(count) - val fieldNode = new ArrowFieldNode(count, nullCount) - val valueBuffers = valueVector.getBuffers(true) - (fieldNode, valueBuffers) - } -} - -private[arrow] class BooleanColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableBitVector - = new NullableBitVector("BooleanValue", getFieldType(dtype), allocator) - override val valueMutator: NullableBitVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, if (row.getBoolean(ordinal)) 1 else 0 ) -} - -private[arrow] class ShortColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableSmallIntVector - = new NullableSmallIntVector("ShortValue", getFieldType(dtype: ArrowType), allocator) - override val valueMutator: NullableSmallIntVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getShort(ordinal)) -} - -private[arrow] class IntegerColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableIntVector - = new NullableIntVector("IntValue", getFieldType(dtype), allocator) - override val valueMutator: NullableIntVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getInt(ordinal)) -} - -private[arrow] class LongColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableBigIntVector - = new NullableBigIntVector("LongValue", getFieldType(dtype), allocator) - override val valueMutator: NullableBigIntVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getLong(ordinal)) -} - -private[arrow] class FloatColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableFloat4Vector - = new 
NullableFloat4Vector("FloatValue", getFieldType(dtype), allocator) - override val valueMutator: NullableFloat4Vector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getFloat(ordinal)) -} - -private[arrow] class DoubleColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableFloat8Vector - = new NullableFloat8Vector("DoubleValue", getFieldType(dtype), allocator) - override val valueMutator: NullableFloat8Vector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getDouble(ordinal)) -} - -private[arrow] class ByteColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableUInt1Vector - = new NullableUInt1Vector("ByteValue", getFieldType(dtype), allocator) - override val valueMutator: NullableUInt1Vector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit - = valueMutator.setSafe(count, row.getByte(ordinal)) -} - -private[arrow] class UTF8StringColumnWriter( - dtype: ArrowType, - ordinal: Int, - allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableVarCharVector - = new NullableVarCharVector("UTF8StringValue", getFieldType(dtype), allocator) - override val valueMutator: NullableVarCharVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit = { - val str = row.getUTF8String(ordinal) - valueMutator.setSafe(count, str.getByteBuffer, 0, str.numBytes) - } -} - -private[arrow] class BinaryColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableVarBinaryVector - = new NullableVarBinaryVector("BinaryValue", getFieldType(dtype), allocator) - override val valueMutator: NullableVarBinaryVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit = { - val bytes = row.getBinary(ordinal) - valueMutator.setSafe(count, bytes, 0, bytes.length) - } -} - -private[arrow] class DateColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableDateDayVector - = new NullableDateDayVector("DateValue", getFieldType(dtype), allocator) - override val valueMutator: NullableDateDayVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: InternalRow): Unit = { - valueMutator.setSafe(count, row.getInt(ordinal)) - } -} - -private[arrow] class TimeStampColumnWriter( - dtype: ArrowType, - ordinal: Int, - allocator: BufferAllocator) - extends PrimitiveColumnWriter(ordinal) { - override val valueVector: NullableTimeStampMicroVector - = new NullableTimeStampMicroVector("TimeStampValue", getFieldType(dtype), allocator) - override val valueMutator: NullableTimeStampMicroVector#Mutator = valueVector.getMutator - - override def setNull(): Unit = valueMutator.setNull(count) - override def setValue(row: 
InternalRow): Unit = { - valueMutator.setSafe(count, row.getLong(ordinal)) - } -} - -private[arrow] object ColumnWriter { - - /** - * Create an Arrow ColumnWriter given the type and ordinal of row. - */ - def apply(dataType: DataType, ordinal: Int, allocator: BufferAllocator): ColumnWriter = { - val dtype = ArrowConverters.sparkTypeToArrowType(dataType) - dataType match { - case BooleanType => new BooleanColumnWriter(dtype, ordinal, allocator) - case ShortType => new ShortColumnWriter(dtype, ordinal, allocator) - case IntegerType => new IntegerColumnWriter(dtype, ordinal, allocator) - case LongType => new LongColumnWriter(dtype, ordinal, allocator) - case FloatType => new FloatColumnWriter(dtype, ordinal, allocator) - case DoubleType => new DoubleColumnWriter(dtype, ordinal, allocator) - case ByteType => new ByteColumnWriter(dtype, ordinal, allocator) - case StringType => new UTF8StringColumnWriter(dtype, ordinal, allocator) - case BinaryType => new BinaryColumnWriter(dtype, ordinal, allocator) - case DateType => new DateColumnWriter(dtype, ordinal, allocator) - case TimestampType => new TimeStampColumnWriter(dtype, ordinal, allocator) - case _ => throw new UnsupportedOperationException(s"Unsupported data type: $dataType") - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala deleted file mode 100644 index 159328cc0d958..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ /dev/null @@ -1,1222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.sql.execution.arrow - -import java.io.File -import java.nio.charset.StandardCharsets -import java.sql.{Date, Timestamp} -import java.text.SimpleDateFormat -import java.util.Locale - -import com.google.common.io.Files -import org.apache.arrow.memory.RootAllocator -import org.apache.arrow.vector.{VectorLoader, VectorSchemaRoot} -import org.apache.arrow.vector.file.json.JsonFileReader -import org.apache.arrow.vector.util.Validator -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.SparkException -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.{BinaryType, StructField, StructType} -import org.apache.spark.util.Utils - - -class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { - import testImplicits._ - - private var tempDataPath: String = _ - - override def beforeAll(): Unit = { - super.beforeAll() - tempDataPath = Utils.createTempDir(namePrefix = "arrow").getAbsolutePath - } - - test("collect to arrow record batch") { - val indexData = (1 to 6).toDF("i") - val arrowPayloads = indexData.toArrowPayload.collect() - assert(arrowPayloads.nonEmpty) - assert(arrowPayloads.length == indexData.rdd.getNumPartitions) - val allocator = new RootAllocator(Long.MaxValue) - val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) - val rowCount = arrowRecordBatches.map(_.getLength).sum - assert(rowCount === indexData.count()) - arrowRecordBatches.foreach(batch => assert(batch.getNodes.size() > 0)) - arrowRecordBatches.foreach(_.close()) - allocator.close() - } - - test("short conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_s", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 16 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } - | }, { - | "name" : "b_s", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 16 - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_s", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 2, -2, 32767, -32768 ] - | }, { - | "name" : "b_s", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1, 0, 0, -2, 0, -32768 ] - | } ] - | } ] - |} - """.stripMargin - - val a_s = List[Short](1, -1, 2, -2, 32767, -32768) - val b_s = List[Option[Short]](Some(1), None, None, Some(-2), None, Some(-32768)) - val df = a_s.zip(b_s).toDF("a_s", "b_s") - - collectAndValidate(df, json, "integer-16bit.json") - } - - test("int conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "b_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : 
"VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_i", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] - | }, { - | "name" : "b_i", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] - | } ] - | } ] - |} - """.stripMargin - - val a_i = List[Int](1, -1, 2, -2, 2147483647, -2147483648) - val b_i = List[Option[Int]](Some(1), None, None, Some(-2), None, Some(-2147483648)) - val df = a_i.zip(b_i).toDF("a_i", "b_i") - - collectAndValidate(df, json, "integer-32bit.json") - } - - test("long conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_l", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 64 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | }, { - | "name" : "b_l", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 64 - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_l", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 2, -2, 9223372036854775807, -9223372036854775808 ] - | }, { - | "name" : "b_l", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1, 0, 0, -2, 0, -9223372036854775808 ] - | } ] - | } ] - |} - """.stripMargin - - val a_l = List[Long](1, -1, 2, -2, 9223372036854775807L, -9223372036854775808L) - val b_l = List[Option[Long]](Some(1), None, None, Some(-2), None, Some(-9223372036854775808L)) - val df = a_l.zip(b_l).toDF("a_l", "b_l") - - collectAndValidate(df, json, "integer-64bit.json") - } - - test("float conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_f", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "SINGLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "b_f", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "SINGLE" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_f", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0 ] - | }, { - | "name" : "b_f", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1.1, 0.0, 0.0, 2.2, 0.0, 3.3 ] - | } ] - | } ] - |} - """.stripMargin - - val a_f = List(1.0f, 2.0f, 0.01f, 200.0f, 0.0001f, 20000.0f) - val b_f = List[Option[Float]](Some(1.1f), None, None, Some(2.2f), None, Some(3.3f)) - val df = a_f.zip(b_f).toDF("a_f", "b_f") - - collectAndValidate(df, json, "floating_point-single_precision.json") - } - - test("double conversion") { - val json = - s""" - |{ - | "schema" : { - | 
"fields" : [ { - | "name" : "a_d", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "DOUBLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | }, { - | "name" : "b_d", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "DOUBLE" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_d", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1.0, 2.0, 0.01, 200.0, 1.0E-4, 20000.0 ] - | }, { - | "name" : "b_d", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1.1, 0.0, 0.0, 2.2, 0.0, 3.3 ] - | } ] - | } ] - |} - """.stripMargin - - val a_d = List(1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0) - val b_d = List[Option[Double]](Some(1.1), None, None, Some(2.2), None, Some(3.3)) - val df = a_d.zip(b_d).toDF("a_d", "b_d") - - collectAndValidate(df, json, "floating_point-double_precision.json") - } - - test("index conversion") { - val data = List[Int](1, 2, 3, 4, 5, 6) - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "i", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, 2, 3, 4, 5, 6 ] - | } ] - | } ] - |} - """.stripMargin - val df = data.toDF("i") - - collectAndValidate(df, json, "indexData-ints.json") - } - - test("mixed numeric type conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 16 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } - | }, { - | "name" : "b", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "SINGLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "c", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "d", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "DOUBLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | }, { - | "name" : "e", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 64 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | 
"type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, 2, 3, 4, 5, 6 ] - | }, { - | "name" : "b", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 ] - | }, { - | "name" : "c", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, 2, 3, 4, 5, 6 ] - | }, { - | "name" : "d", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 ] - | }, { - | "name" : "e", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, 2, 3, 4, 5, 6 ] - | } ] - | } ] - |} - """.stripMargin - - val data = List(1, 2, 3, 4, 5, 6) - val data_tuples = for (d <- data) yield { - (d.toShort, d.toFloat, d.toInt, d.toDouble, d.toLong) - } - val df = data_tuples.toDF("a", "b", "c", "d", "e") - - collectAndValidate(df, json, "mixed_numeric_types.json") - } - - test("string type conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "upper_case", - | "type" : { - | "name" : "utf8" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } - | }, { - | "name" : "lower_case", - | "type" : { - | "name" : "utf8" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } - | }, { - | "name" : "null_str", - | "type" : { - | "name" : "utf8" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 3, - | "columns" : [ { - | "name" : "upper_case", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "OFFSET" : [ 0, 1, 2, 3 ], - | "DATA" : [ "A", "B", "C" ] - | }, { - | "name" : "lower_case", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "OFFSET" : [ 0, 1, 2, 3 ], - | "DATA" : [ "a", "b", "c" ] - | }, { - | "name" : "null_str", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 0 ], - | "OFFSET" : [ 0, 2, 5, 5 ], - | "DATA" : [ "ab", "CDE", "" ] - | } ] - | } ] - |} - """.stripMargin - - val upperCase = Seq("A", "B", "C") - val lowerCase = Seq("a", "b", "c") - val nullStr = Seq("ab", "CDE", null) - val df = (upperCase, lowerCase, nullStr).zipped.toList - .toDF("upper_case", "lower_case", "null_str") - - collectAndValidate(df, json, "stringData.json") - } - - test("boolean type conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_bool", - | "type" : { - | "name" : "bool" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 1 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 4, - | "columns" : [ { - | "name" : "a_bool", - | "count" : 4, - | "VALIDITY" : [ 1, 1, 1, 1 ], - | "DATA" : [ true, true, false, true ] - | } ] - | } 
] - |} - """.stripMargin - val df = Seq(true, true, false, true).toDF("a_bool") - collectAndValidate(df, json, "boolData.json") - } - - test("byte type conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_byte", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 8 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 4, - | "columns" : [ { - | "name" : "a_byte", - | "count" : 4, - | "VALIDITY" : [ 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 64, 127 ] - | } ] - | } ] - |} - | - """.stripMargin - val df = List[Byte](1.toByte, (-1).toByte, 64.toByte, Byte.MaxValue).toDF("a_byte") - collectAndValidate(df, json, "byteData.json") - } - - test("binary type conversion") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_binary", - | "type" : { - | "name" : "binary" - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 3, - | "columns" : [ { - | "name" : "a_binary", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "OFFSET" : [ 0, 3, 4, 6 ], - | "DATA" : [ "616263", "64", "6566" ] - | } ] - | } ] - |} - """.stripMargin - - val data = Seq("abc", "d", "ef") - val rdd = sparkContext.parallelize(data.map(s => Row(s.getBytes("utf-8")))) - val df = spark.createDataFrame(rdd, StructType(Seq(StructField("a_binary", BinaryType)))) - - collectAndValidate(df, json, "binaryData.json") - } - - test("floating-point NaN") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "NaN_f", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "SINGLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "NaN_d", - | "type" : { - | "name" : "floatingpoint", - | "precision" : "DOUBLE" - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 2, - | "columns" : [ { - | "name" : "NaN_f", - | "count" : 2, - | "VALIDITY" : [ 1, 1 ], - | "DATA" : [ 1.2000000476837158, "NaN" ] - | }, { - | "name" : "NaN_d", - | "count" : 2, - | "VALIDITY" : [ 1, 1 ], - | "DATA" : [ "NaN", 1.2 ] - | } ] - | } ] - |} - """.stripMargin - - val fnan = Seq(1.2F, Float.NaN) - val dnan = Seq(Double.NaN, 1.2) - val df = fnan.zip(dnan).toDF("NaN_f", "NaN_d") - - collectAndValidate(df, json, "nanData-floating_point.json") - } - - test("partitioned DataFrame") { - val json1 = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "b", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | 
"bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 3, - | "columns" : [ { - | "name" : "a", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "DATA" : [ 1, 1, 2 ] - | }, { - | "name" : "b", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "DATA" : [ 1, 2, 1 ] - | } ] - | } ] - |} - """.stripMargin - val json2 = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "b", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 3, - | "columns" : [ { - | "name" : "a", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "DATA" : [ 2, 3, 3 ] - | }, { - | "name" : "b", - | "count" : 3, - | "VALIDITY" : [ 1, 1, 1 ], - | "DATA" : [ 2, 1, 2 ] - | } ] - | } ] - |} - """.stripMargin - - val arrowPayloads = testData2.toArrowPayload.collect() - // NOTE: testData2 should have 2 partitions -> 2 arrow batches in payload - assert(arrowPayloads.length === 2) - val schema = testData2.schema - - val tempFile1 = new File(tempDataPath, "testData2-ints-part1.json") - val tempFile2 = new File(tempDataPath, "testData2-ints-part2.json") - Files.write(json1, tempFile1, StandardCharsets.UTF_8) - Files.write(json2, tempFile2, StandardCharsets.UTF_8) - - validateConversion(schema, arrowPayloads(0), tempFile1) - validateConversion(schema, arrowPayloads(1), tempFile2) - } - - test("empty frame collect") { - val arrowPayload = spark.emptyDataFrame.toArrowPayload.collect() - assert(arrowPayload.isEmpty) - - val filteredDF = List[Int](1, 2, 3, 4, 5, 6).toDF("i") - val filteredArrowPayload = filteredDF.filter("i < 0").toArrowPayload.collect() - assert(filteredArrowPayload.isEmpty) - } - - test("empty partition collect") { - val emptyPart = spark.sparkContext.parallelize(Seq(1), 2).toDF("i") - val arrowPayloads = emptyPart.toArrowPayload.collect() - assert(arrowPayloads.length === 1) - val allocator = new RootAllocator(Long.MaxValue) - val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) - assert(arrowRecordBatches.head.getLength == 1) - arrowRecordBatches.foreach(_.close()) - allocator.close() - } - - test("max records in batch conf") { - val totalRecords = 10 - val maxRecordsPerBatch = 3 - spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", maxRecordsPerBatch) - val df = spark.sparkContext.parallelize(1 to totalRecords, 2).toDF("i") - val arrowPayloads = df.toArrowPayload.collect() - val allocator = new RootAllocator(Long.MaxValue) - val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) - var recordCount = 0 - arrowRecordBatches.foreach { batch => - assert(batch.getLength > 0) - assert(batch.getLength <= maxRecordsPerBatch) - recordCount += batch.getLength - batch.close() - } - assert(recordCount == totalRecords) - allocator.close() - 
spark.conf.unset("spark.sql.execution.arrow.maxRecordsPerBatch") - } - - testQuietly("unsupported types") { - def runUnsupported(block: => Unit): Unit = { - val msg = intercept[SparkException] { - block - } - assert(msg.getMessage.contains("Unsupported data type")) - assert(msg.getCause.getClass === classOf[UnsupportedOperationException]) - } - - runUnsupported { decimalData.toArrowPayload.collect() } - runUnsupported { arrayData.toDF().toArrowPayload.collect() } - runUnsupported { mapData.toDF().toArrowPayload.collect() } - runUnsupported { complexData.toArrowPayload.collect() } - - val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z", Locale.US) - val d1 = new Date(sdf.parse("2015-04-08 13:10:15.000 UTC").getTime) - val d2 = new Date(sdf.parse("2016-05-09 13:10:15.000 UTC").getTime) - runUnsupported { Seq(d1, d2).toDF("date").toArrowPayload.collect() } - - val ts1 = new Timestamp(sdf.parse("2013-04-08 01:10:15.567 UTC").getTime) - val ts2 = new Timestamp(sdf.parse("2013-04-08 13:10:10.789 UTC").getTime) - runUnsupported { Seq(ts1, ts2).toDF("timestamp").toArrowPayload.collect() } - } - - test("test Arrow Validator") { - val json = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "a_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "b_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_i", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] - | }, { - | "name" : "b_i", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] - | } ] - | } ] - |} - """.stripMargin - val json_diff_col_order = - s""" - |{ - | "schema" : { - | "fields" : [ { - | "name" : "b_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | }, { - | "name" : "a_i", - | "type" : { - | "name" : "int", - | "isSigned" : true, - | "bitWidth" : 32 - | }, - | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ] - | }, - | "batches" : [ { - | "count" : 6, - | "columns" : [ { - | "name" : "a_i", - | "count" : 6, - | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], - | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] - | }, { - | "name" : "b_i", - | "count" : 6, - | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], - | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] - | } ] - | } ] - |} - """.stripMargin - - val a_i = List[Int](1, -1, 2, -2, 2147483647, -2147483648) - val b_i = List[Option[Int]](Some(1), None, None, Some(-2), None, Some(-2147483648)) - val df = a_i.zip(b_i).toDF("a_i", "b_i") - - // Different schema - intercept[IllegalArgumentException] { - collectAndValidate(df, 
json_diff_col_order, "validator_diff_schema.json") - } - - // Different values - intercept[IllegalArgumentException] { - collectAndValidate(df.sort($"a_i".desc), json, "validator_diff_values.json") - } - } - - /** Test that a converted DataFrame to Arrow record batch equals batch read from JSON file */ - private def collectAndValidate(df: DataFrame, json: String, file: String): Unit = { - // NOTE: coalesce to single partition because can only load 1 batch in validator - val arrowPayload = df.coalesce(1).toArrowPayload.collect().head - val tempFile = new File(tempDataPath, file) - Files.write(json, tempFile, StandardCharsets.UTF_8) - validateConversion(df.schema, arrowPayload, tempFile) - } - - private def validateConversion( - sparkSchema: StructType, - arrowPayload: ArrowPayload, - jsonFile: File): Unit = { - val allocator = new RootAllocator(Long.MaxValue) - val jsonReader = new JsonFileReader(jsonFile, allocator) - - val arrowSchema = ArrowConverters.schemaToArrowSchema(sparkSchema) - val jsonSchema = jsonReader.start() - Validator.compareSchemas(arrowSchema, jsonSchema) - - val arrowRoot = VectorSchemaRoot.create(arrowSchema, allocator) - val vectorLoader = new VectorLoader(arrowRoot) - val arrowRecordBatch = arrowPayload.loadBatch(allocator) - vectorLoader.load(arrowRecordBatch) - val jsonRoot = jsonReader.read() - Validator.compareVectorSchemaRoot(arrowRoot, jsonRoot) - - jsonRoot.close() - jsonReader.close() - arrowRecordBatch.close() - arrowRoot.close() - allocator.close() - } -} From e68aed70fbf1cfa59ba51df70287d718d737a193 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 28 Jun 2017 10:45:45 -0700 Subject: [PATCH 030/779] [SPARK-21216][SS] Hive strategies missed in Structured Streaming IncrementalExecution ## What changes were proposed in this pull request? If someone creates a HiveSession, the planner in `IncrementalExecution` doesn't take into account the Hive scan strategies. This causes joins of Streaming DataFrame's with Hive tables to fail. ## How was this patch tested? Regression test Author: Burak Yavuz Closes #18426 from brkyvz/hive-join. 
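For context, a minimal sketch of the failing scenario (a stream-to-Hive-table join), adapted from the regression test added to `HiveDDLSuite` below. It is not part of the patch itself; the table and query names (`small_dim`, `joined_out`, `src`) are illustrative, it assumes a Hive-enabled session, and it uses the test-only `MemoryStream` source:

```scala
// Sketch only: mirrors the regression test added to HiveDDLSuite below.
// Requires spark-hive on the classpath; MemoryStream is a test-only source.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.MemoryStream

val spark = SparkSession.builder().master("local[*]").enableHiveSupport().getOrCreate()
import spark.implicits._
implicit val sqlCtx = spark.sqlContext // required by MemoryStream

// Static side of the join: a small Hive table (names are illustrative).
Seq((1, "one"), (2, "two")).toDF("number", "word").createOrReplaceTempView("src")
spark.sql("CREATE TABLE IF NOT EXISTS small_dim (word STRING, number INT) STORED AS TEXTFILE")
spark.sql("INSERT INTO small_dim SELECT word, number FROM src")

// Streaming side, joined against the Hive table; before this patch the
// planner in IncrementalExecution missed the Hive scan strategies here.
val input = MemoryStream[Int]
val joined = input.toDS().toDF().join(spark.table("small_dim"), $"value" === $"number")

val query = joined.writeStream.format("memory").queryName("joined_out").start()
input.addData(1, 2)
query.processAllAvailable()
spark.table("joined_out").show()
query.stop()
```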
--- .../streaming/IncrementalExecution.scala | 4 ++ .../sql/hive/execution/HiveDDLSuite.scala | 41 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index ab89dc6b705d5..dbe652b3b1ed2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -47,6 +47,10 @@ class IncrementalExecution( sparkSession.sparkContext, sparkSession.sessionState.conf, sparkSession.sessionState.experimentalMethods) { + override def strategies: Seq[Strategy] = + extraPlanningStrategies ++ + sparkSession.sessionState.planner.strategies + override def extraPlanningStrategies: Seq[Strategy] = StatefulAggregationStrategy :: FlatMapGroupsWithStateStrategy :: diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index aca964907d4cd..31fa3d2447467 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -160,7 +160,6 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA test("drop table") { testDropTable(isDatasourceTable = false) } - } class HiveDDLSuite @@ -1956,4 +1955,44 @@ class HiveDDLSuite } } } + + test("SPARK-21216: join with a streaming DataFrame") { + import org.apache.spark.sql.execution.streaming.MemoryStream + import testImplicits._ + + implicit val _sqlContext = spark.sqlContext + + Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word").createOrReplaceTempView("t1") + // Make a table and ensure it will be broadcast. + sql("""CREATE TABLE smallTable(word string, number int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |STORED AS TEXTFILE + """.stripMargin) + + sql( + """INSERT INTO smallTable + |SELECT word, number from t1 + """.stripMargin) + + val inputData = MemoryStream[Int] + val joined = inputData.toDS().toDF() + .join(spark.table("smallTable"), $"value" === $"number") + + val sq = joined.writeStream + .format("memory") + .queryName("t2") + .start() + try { + inputData.addData(1, 2) + + sq.processAllAvailable() + + checkAnswer( + spark.table("t2"), + Seq(Row(1, "one", 1), Row(2, "two", 2)) + ) + } finally { + sq.stop() + } + } } From b72b8521d9cad878a1a4e4dbb19cf980169dcbc7 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 29 Jun 2017 08:47:31 +0800 Subject: [PATCH 031/779] [SPARK-21222] Move elimination of Distinct clause from analyzer to optimizer ## What changes were proposed in this pull request? Move elimination of Distinct clause from analyzer to optimizer Distinct clause is useless after MAX/MIN clause. For example, "Select MAX(distinct a) FROM src from" is equivalent of "Select MAX(a) FROM src from" However, this optimization is implemented in analyzer. It should be in optimizer. ## How was this patch tested? Unit test gatorsmile cloud-fan Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang Closes #18429 from gengliangwang/distinct_opt. 
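As a side note, a small standalone sketch (not part of the patch) of the equivalence this rule relies on; the table name `src` is illustrative. The rule itself (`EliminateDistinct`) and its unit tests appear in the diff below:

```scala
// Sketch only: MAX(DISTINCT x) is semantically the same as MAX(x), so the
// DISTINCT flag can be dropped; after this patch that happens in the
// optimizer (EliminateDistinct) instead of the analyzer.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

Seq(1, 2, 2, 3).toDF("a").createOrReplaceTempView("src")

val withDistinct = spark.sql("SELECT MAX(DISTINCT a) FROM src")
val withoutDistinct = spark.sql("SELECT MAX(a) FROM src")

withDistinct.show()     // 3
withoutDistinct.show()  // 3

// After optimization the aggregate no longer carries the distinct flag:
println(withDistinct.queryExecution.optimizedPlan)
```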
--- .../sql/catalyst/analysis/Analyzer.scala | 5 -- .../spark/sql/catalyst/dsl/package.scala | 2 + .../sql/catalyst/optimizer/Optimizer.scala | 15 +++++ .../optimizer/EliminateDistinctSuite.scala | 56 +++++++++++++++++++ 4 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 434b6ffee37fa..53536496d0457 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1197,11 +1197,6 @@ class Analyzer( case u @ UnresolvedFunction(funcId, children, isDistinct) => withPosition(u) { catalog.lookupFunction(funcId, children) match { - // DISTINCT is not meaningful for a Max or a Min. - case max: Max if isDistinct => - AggregateExpression(max, Complete, isDistinct = false) - case min: Min if isDistinct => - AggregateExpression(min, Complete, isDistinct = false) // AggregateWindowFunctions are AggregateFunctions that can only be evaluated within // the context of a Window clause. They do not need to be wrapped in an // AggregateExpression. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index beee93d906f0f..f6792569b704e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -159,7 +159,9 @@ package object dsl { def first(e: Expression): Expression = new First(e).toAggregateExpression() def last(e: Expression): Expression = new Last(e).toAggregateExpression() def min(e: Expression): Expression = Min(e).toAggregateExpression() + def minDistinct(e: Expression): Expression = Min(e).toAggregateExpression(isDistinct = true) def max(e: Expression): Expression = Max(e).toAggregateExpression() + def maxDistinct(e: Expression): Expression = Max(e).toAggregateExpression(isDistinct = true) def upper(e: Expression): Expression = Upper(e) def lower(e: Expression): Expression = Lower(e) def sqrt(e: Expression): Expression = Sqrt(e) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index b410312030c5d..946fa7bae0199 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -40,6 +40,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) protected val fixedPoint = FixedPoint(conf.optimizerMaxIterations) def batches: Seq[Batch] = { + Batch("Eliminate Distinct", Once, EliminateDistinct) :: // Technically some of the rules in Finish Analysis are not optimizer rules and belong more // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime). // However, because we also use the analyzer to canonicalized queries (for view definition), @@ -151,6 +152,20 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = Nil } +/** + * Remove useless DISTINCT for MAX and MIN. 
+ * This rule should be applied before RewriteDistinctAggregates. + */ +object EliminateDistinct extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformExpressions { + case ae: AggregateExpression if ae.isDistinct => + ae.aggregateFunction match { + case _: Max | _: Min => ae.copy(isDistinct = false) + case _ => ae + } + } +} + /** * An optimizer used in test code. * diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala new file mode 100644 index 0000000000000..f40691bd1a038 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + +class EliminateDistinctSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("Operator Optimizations", Once, + EliminateDistinct) :: Nil + } + + val testRelation = LocalRelation('a.int) + + test("Eliminate Distinct in Max") { + val query = testRelation + .select(maxDistinct('a).as('result)) + .analyze + val answer = testRelation + .select(max('a).as('result)) + .analyze + assert(query != answer) + comparePlans(Optimize.execute(query), answer) + } + + test("Eliminate Distinct in Min") { + val query = testRelation + .select(minDistinct('a).as('result)) + .analyze + val answer = testRelation + .select(min('a).as('result)) + .analyze + assert(query != answer) + comparePlans(Optimize.execute(query), answer) + } +} From 376d90d556fcd4fd84f70ee42a1323e1f48f829d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 28 Jun 2017 19:31:54 -0700 Subject: [PATCH 032/779] [SPARK-20889][SPARKR] Grouped documentation for STRING column methods ## What changes were proposed in this pull request? Grouped documentation for string column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18366 from actuaryzhang/sparkRDocString. 
--- R/pkg/R/functions.R | 573 +++++++++++++++++++------------------------- R/pkg/R/generics.R | 84 ++++--- 2 files changed, 300 insertions(+), 357 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 23ccdf941a8c7..70ea620b471fe 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -111,6 +111,27 @@ NULL #' head(tmp)} NULL +#' String functions for Column operations +#' +#' String functions defined for \code{Column}. +#' +#' @param x Column to compute on except in the following methods: +#' \itemize{ +#' \item \code{instr}: \code{character}, the substring to check. See 'Details'. +#' \item \code{format_number}: \code{numeric}, the number of decimal place to +#' format to. See 'Details'. +#' } +#' @param y Column to compute on. +#' @param ... additional columns. +#' @name column_string_functions +#' @rdname column_string_functions +#' @family string functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. @@ -188,19 +209,17 @@ setMethod("approxCountDistinct", column(jc) }) -#' ascii -#' -#' Computes the numeric value of the first character of the string column, and returns the -#' result as a int column. -#' -#' @param x Column to compute on. +#' @details +#' \code{ascii}: Computes the numeric value of the first character of the string column, +#' and returns the result as an int column. #' -#' @rdname ascii -#' @name ascii -#' @family string functions +#' @rdname column_string_functions #' @export -#' @aliases ascii,Column-method -#' @examples \dontrun{\dontrun{ascii(df$c)}} +#' @aliases ascii ascii,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, ascii(df$Class), ascii(df$Sex)))} #' @note ascii since 1.5.0 setMethod("ascii", signature(x = "Column"), @@ -256,19 +275,22 @@ setMethod("avg", column(jc) }) -#' base64 -#' -#' Computes the BASE64 encoding of a binary column and returns it as a string column. -#' This is the reverse of unbase64. -#' -#' @param x Column to compute on. +#' @details +#' \code{base64}: Computes the BASE64 encoding of a binary column and returns it as +#' a string column. This is the reverse of unbase64. #' -#' @rdname base64 -#' @name base64 -#' @family string functions +#' @rdname column_string_functions #' @export -#' @aliases base64,Column-method -#' @examples \dontrun{base64(df$c)} +#' @aliases base64 base64,Column-method +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, s1 = encode(df$Class, "UTF-8")) +#' str(tmp) +#' tmp2 <- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, "UTF-8"), +#' s4 = soundex(tmp$Sex)) +#' head(tmp2) +#' head(select(tmp2, unbase64(tmp2$s2)))} #' @note base64 since 1.5.0 setMethod("base64", signature(x = "Column"), @@ -620,20 +642,16 @@ setMethod("dayofyear", column(jc) }) -#' decode -#' -#' Computes the first argument into a string from a binary using the provided character set -#' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). +#' @details +#' \code{decode}: Computes the first argument into a string from a binary using the provided +#' character set. #' -#' @param x Column to compute on. -#' @param charset Character set to use +#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", +#' "UTF-16LE", "UTF-16"). 
#' -#' @rdname decode -#' @name decode -#' @family string functions -#' @aliases decode,Column,character-method +#' @rdname column_string_functions +#' @aliases decode decode,Column,character-method #' @export -#' @examples \dontrun{decode(df$c, "UTF-8")} #' @note decode since 1.6.0 setMethod("decode", signature(x = "Column", charset = "character"), @@ -642,20 +660,13 @@ setMethod("decode", column(jc) }) -#' encode -#' -#' Computes the first argument into a binary from a string using the provided character set -#' (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). -#' -#' @param x Column to compute on. -#' @param charset Character set to use +#' @details +#' \code{encode}: Computes the first argument into a binary from a string using the provided +#' character set. #' -#' @rdname encode -#' @name encode -#' @family string functions -#' @aliases encode,Column,character-method +#' @rdname column_string_functions +#' @aliases encode encode,Column,character-method #' @export -#' @examples \dontrun{encode(df$c, "UTF-8")} #' @note encode since 1.6.0 setMethod("encode", signature(x = "Column", charset = "character"), @@ -788,21 +799,23 @@ setMethod("hour", column(jc) }) -#' initcap -#' -#' Returns a new string column by converting the first letter of each word to uppercase. -#' Words are delimited by whitespace. -#' -#' For example, "hello world" will become "Hello World". -#' -#' @param x Column to compute on. +#' @details +#' \code{initcap}: Returns a new string column by converting the first letter of +#' each word to uppercase. Words are delimited by whitespace. For example, "hello world" +#' will become "Hello World". #' -#' @rdname initcap -#' @name initcap -#' @family string functions -#' @aliases initcap,Column-method +#' @rdname column_string_functions +#' @aliases initcap initcap,Column-method #' @export -#' @examples \dontrun{initcap(df$c)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, sex_lower = lower(df$Sex), age_upper = upper(df$age), +#' sex_age = concat_ws(" ", lower(df$sex), lower(df$age))) +#' head(tmp) +#' tmp2 <- mutate(tmp, s1 = initcap(tmp$sex_lower), s2 = initcap(tmp$sex_age), +#' s3 = reverse(df$Sex)) +#' head(tmp2)} #' @note initcap since 1.5.0 setMethod("initcap", signature(x = "Column"), @@ -918,18 +931,12 @@ setMethod("last_day", column(jc) }) -#' length -#' -#' Computes the length of a given string or binary column. -#' -#' @param x Column to compute on. +#' @details +#' \code{length}: Computes the length of a given string or binary column. #' -#' @rdname length -#' @name length -#' @aliases length,Column-method -#' @family string functions +#' @rdname column_string_functions +#' @aliases length length,Column-method #' @export -#' @examples \dontrun{length(df$c)} #' @note length since 1.5.0 setMethod("length", signature(x = "Column"), @@ -994,18 +1001,12 @@ setMethod("log2", column(jc) }) -#' lower -#' -#' Converts a string column to lower case. -#' -#' @param x Column to compute on. +#' @details +#' \code{lower}: Converts a string column to lower case. #' -#' @rdname lower -#' @name lower -#' @family string functions -#' @aliases lower,Column-method +#' @rdname column_string_functions +#' @aliases lower lower,Column-method #' @export -#' @examples \dontrun{lower(df$c)} #' @note lower since 1.4.0 setMethod("lower", signature(x = "Column"), @@ -1014,18 +1015,24 @@ setMethod("lower", column(jc) }) -#' ltrim -#' -#' Trim the spaces from left end for the specified string value. -#' -#' @param x Column to compute on. 
+#' @details +#' \code{ltrim}: Trims the spaces from left end for the specified string value. #' -#' @rdname ltrim -#' @name ltrim -#' @family string functions -#' @aliases ltrim,Column-method +#' @rdname column_string_functions +#' @aliases ltrim ltrim,Column-method #' @export -#' @examples \dontrun{ltrim(df$c)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, " "), SexRpad = rpad(df$Sex, 7, " ")) +#' head(select(tmp, length(tmp$Sex), length(tmp$SexLpad), length(tmp$SexRpad))) +#' tmp2 <- mutate(tmp, SexLtrim = ltrim(tmp$SexLpad), SexRtrim = rtrim(tmp$SexRpad), +#' SexTrim = trim(tmp$SexLpad)) +#' head(select(tmp2, length(tmp2$Sex), length(tmp2$SexLtrim), +#' length(tmp2$SexRtrim), length(tmp2$SexTrim))) +#' +#' tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, "xx"), SexRpad = rpad(df$Sex, 7, "xx")) +#' head(tmp)} #' @note ltrim since 1.5.0 setMethod("ltrim", signature(x = "Column"), @@ -1198,18 +1205,12 @@ setMethod("quarter", column(jc) }) -#' reverse -#' -#' Reverses the string column and returns it as a new string column. -#' -#' @param x Column to compute on. +#' @details +#' \code{reverse}: Reverses the string column and returns it as a new string column. #' -#' @rdname reverse -#' @name reverse -#' @family string functions -#' @aliases reverse,Column-method +#' @rdname column_string_functions +#' @aliases reverse reverse,Column-method #' @export -#' @examples \dontrun{reverse(df$c)} #' @note reverse since 1.5.0 setMethod("reverse", signature(x = "Column"), @@ -1268,18 +1269,12 @@ setMethod("bround", column(jc) }) -#' rtrim -#' -#' Trim the spaces from right end for the specified string value. -#' -#' @param x Column to compute on. +#' @details +#' \code{rtrim}: Trims the spaces from right end for the specified string value. #' -#' @rdname rtrim -#' @name rtrim -#' @family string functions -#' @aliases rtrim,Column-method +#' @rdname column_string_functions +#' @aliases rtrim rtrim,Column-method #' @export -#' @examples \dontrun{rtrim(df$c)} #' @note rtrim since 1.5.0 setMethod("rtrim", signature(x = "Column"), @@ -1409,18 +1404,12 @@ setMethod("skewness", column(jc) }) -#' soundex -#' -#' Return the soundex code for the specified expression. -#' -#' @param x Column to compute on. +#' @details +#' \code{soundex}: Returns the soundex code for the specified expression. #' -#' @rdname soundex -#' @name soundex -#' @family string functions -#' @aliases soundex,Column-method +#' @rdname column_string_functions +#' @aliases soundex soundex,Column-method #' @export -#' @examples \dontrun{soundex(df$c)} #' @note soundex since 1.5.0 setMethod("soundex", signature(x = "Column"), @@ -1731,18 +1720,12 @@ setMethod("to_timestamp", column(jc) }) -#' trim -#' -#' Trim the spaces from both ends for the specified string column. -#' -#' @param x Column to compute on. +#' @details +#' \code{trim}: Trims the spaces from both ends for the specified string column. #' -#' @rdname trim -#' @name trim -#' @family string functions -#' @aliases trim,Column-method +#' @rdname column_string_functions +#' @aliases trim trim,Column-method #' @export -#' @examples \dontrun{trim(df$c)} #' @note trim since 1.5.0 setMethod("trim", signature(x = "Column"), @@ -1751,19 +1734,13 @@ setMethod("trim", column(jc) }) -#' unbase64 -#' -#' Decodes a BASE64 encoded string column and returns it as a binary column. +#' @details +#' \code{unbase64}: Decodes a BASE64 encoded string column and returns it as a binary column. #' This is the reverse of base64. #' -#' @param x Column to compute on. 
-#' -#' @rdname unbase64 -#' @name unbase64 -#' @family string functions -#' @aliases unbase64,Column-method +#' @rdname column_string_functions +#' @aliases unbase64 unbase64,Column-method #' @export -#' @examples \dontrun{unbase64(df$c)} #' @note unbase64 since 1.5.0 setMethod("unbase64", signature(x = "Column"), @@ -1787,18 +1764,12 @@ setMethod("unhex", column(jc) }) -#' upper -#' -#' Converts a string column to upper case. -#' -#' @param x Column to compute on. +#' @details +#' \code{upper}: Converts a string column to upper case. #' -#' @rdname upper -#' @name upper -#' @family string functions -#' @aliases upper,Column-method +#' @rdname column_string_functions +#' @aliases upper upper,Column-method #' @export -#' @examples \dontrun{upper(df$c)} #' @note upper since 1.4.0 setMethod("upper", signature(x = "Column"), @@ -1949,19 +1920,19 @@ setMethod("hypot", signature(y = "Column"), column(jc) }) -#' levenshtein -#' -#' Computes the Levenshtein distance of the two given string columns. -#' -#' @param x Column to compute on. -#' @param y Column to compute on. +#' @details +#' \code{levenshtein}: Computes the Levenshtein distance of the two given string columns. #' -#' @rdname levenshtein -#' @name levenshtein -#' @family string functions -#' @aliases levenshtein,Column-method +#' @rdname column_string_functions +#' @aliases levenshtein levenshtein,Column-method #' @export -#' @examples \dontrun{levenshtein(df$c, x)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, d1 = levenshtein(df$Class, df$Sex), +#' d2 = levenshtein(df$Age, df$Sex), +#' d3 = levenshtein(df$Age, df$Age)) +#' head(tmp)} #' @note levenshtein since 1.5.0 setMethod("levenshtein", signature(y = "Column"), function(y, x) { @@ -2061,20 +2032,22 @@ setMethod("countDistinct", column(jc) }) - -#' concat -#' -#' Concatenates multiple input string columns together into a single string column. -#' -#' @param x Column to compute on -#' @param ... other columns +#' @details +#' \code{concat}: Concatenates multiple input string columns together into a single string column. #' -#' @family string functions -#' @rdname concat -#' @name concat -#' @aliases concat,Column-method +#' @rdname column_string_functions +#' @aliases concat concat,Column-method #' @export -#' @examples \dontrun{concat(df$strings, df$strings2)} +#' @examples +#' +#' \dontrun{ +#' # concatenate strings +#' tmp <- mutate(df, s1 = concat(df$Class, df$Sex), +#' s2 = concat(df$Class, df$Sex, df$Age), +#' s3 = concat(df$Class, df$Sex, df$Age, df$Class), +#' s4 = concat_ws("_", df$Class, df$Sex), +#' s5 = concat_ws("+", df$Class, df$Sex, df$Age, df$Survived)) +#' head(tmp)} #' @note concat since 1.5.0 setMethod("concat", signature(x = "Column"), @@ -2243,22 +2216,21 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), column(jc) }) -#' instr -#' -#' Locate the position of the first occurrence of substr column in the given string. -#' Returns null if either of the arguments are null. -#' -#' Note: The position is not zero based, but 1 based index. Returns 0 if substr -#' could not be found in str. +#' @details +#' \code{instr}: Locates the position of the first occurrence of a substring (\code{x}) +#' in the given string column (\code{y}). Returns null if either of the arguments are null. +#' Note: The position is not zero based, but 1 based index. Returns 0 if the substring +#' could not be found in the string column. 
#' -#' @param y column to check -#' @param x substring to check -#' @family string functions -#' @aliases instr,Column,character-method -#' @rdname instr -#' @name instr +#' @rdname column_string_functions +#' @aliases instr instr,Column,character-method #' @export -#' @examples \dontrun{instr(df$c, 'b')} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, s1 = instr(df$Sex, "m"), s2 = instr(df$Sex, "M"), +#' s3 = locate("m", df$Sex), s4 = locate("m", df$Sex, pos = 4)) +#' head(tmp)} #' @note instr since 1.5.0 setMethod("instr", signature(y = "Column", x = "character"), function(y, x) { @@ -2345,22 +2317,22 @@ setMethod("date_sub", signature(y = "Column", x = "numeric"), column(jc) }) -#' format_number -#' -#' Formats numeric column y to a format like '#,###,###.##', rounded to x decimal places -#' with HALF_EVEN round mode, and returns the result as a string column. -#' -#' If x is 0, the result has no decimal point or fractional part. -#' If x < 0, the result will be null. +#' @details +#' \code{format_number}: Formats numeric column \code{y} to a format like '#,###,###.##', +#' rounded to \code{x} decimal places with HALF_EVEN round mode, and returns the result +#' as a string column. +#' If \code{x} is 0, the result has no decimal point or fractional part. +#' If \code{x} < 0, the result will be null. #' -#' @param y column to format -#' @param x number of decimal place to format to -#' @family string functions -#' @rdname format_number -#' @name format_number -#' @aliases format_number,Column,numeric-method +#' @rdname column_string_functions +#' @aliases format_number format_number,Column,numeric-method #' @export -#' @examples \dontrun{format_number(df$n, 4)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, v1 = df$Freq/3) +#' head(select(tmp, format_number(tmp$v1, 0), format_number(tmp$v1, 2), +#' format_string("%4.2f %s", tmp$v1, tmp$Sex)), 10)} #' @note format_number since 1.5.0 setMethod("format_number", signature(y = "Column", x = "numeric"), function(y, x) { @@ -2438,21 +2410,14 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"), column(jc) }) -#' concat_ws -#' -#' Concatenates multiple input string columns together into a single string column, -#' using the given separator. +#' @details +#' \code{concat_ws}: Concatenates multiple input string columns together into a single +#' string column, using the given separator. #' -#' @param x column to concatenate. #' @param sep separator to use. -#' @param ... other columns to concatenate. -#' -#' @family string functions -#' @rdname concat_ws -#' @name concat_ws -#' @aliases concat_ws,character,Column-method +#' @rdname column_string_functions +#' @aliases concat_ws concat_ws,character,Column-method #' @export -#' @examples \dontrun{concat_ws('-', df$s, df$d)} #' @note concat_ws since 1.5.0 setMethod("concat_ws", signature(sep = "character", x = "Column"), function(sep, x, ...) { @@ -2499,19 +2464,14 @@ setMethod("expr", signature(x = "character"), column(jc) }) -#' format_string -#' -#' Formats the arguments in printf-style and returns the result as a string column. +#' @details +#' \code{format_string}: Formats the arguments in printf-style and returns the result +#' as a string column. #' #' @param format a character object of format strings. -#' @param x a Column. -#' @param ... additional Column(s). 
-#' @family string functions -#' @rdname format_string -#' @name format_string -#' @aliases format_string,character,Column-method +#' @rdname column_string_functions +#' @aliases format_string format_string,character,Column-method #' @export -#' @examples \dontrun{format_string('%d %s', df$a, df$b)} #' @note format_string since 1.5.0 setMethod("format_string", signature(format = "character", x = "Column"), function(format, x, ...) { @@ -2620,23 +2580,17 @@ setMethod("window", signature(x = "Column"), column(jc) }) -#' locate -#' -#' Locate the position of the first occurrence of substr. -#' +#' @details +#' \code{locate}: Locates the position of the first occurrence of substr. #' Note: The position is not zero based, but 1 based index. Returns 0 if substr #' could not be found in str. #' #' @param substr a character string to be matched. #' @param str a Column where matches are sought for each entry. #' @param pos start position of search. -#' @param ... further arguments to be passed to or from other methods. -#' @family string functions -#' @rdname locate -#' @aliases locate,character,Column-method -#' @name locate +#' @rdname column_string_functions +#' @aliases locate locate,character,Column-method #' @export -#' @examples \dontrun{locate('b', df$c, 1)} #' @note locate since 1.5.0 setMethod("locate", signature(substr = "character", str = "Column"), function(substr, str, pos = 1) { @@ -2646,19 +2600,14 @@ setMethod("locate", signature(substr = "character", str = "Column"), column(jc) }) -#' lpad -#' -#' Left-pad the string column with +#' @details +#' \code{lpad}: Left-padded with pad to a length of len. #' -#' @param x the string Column to be left-padded. #' @param len maximum length of each output result. #' @param pad a character string to be padded with. -#' @family string functions -#' @rdname lpad -#' @aliases lpad,Column,numeric,character-method -#' @name lpad +#' @rdname column_string_functions +#' @aliases lpad lpad,Column,numeric,character-method #' @export -#' @examples \dontrun{lpad(df$c, 6, '#')} #' @note lpad since 1.5.0 setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), function(x, len, pad) { @@ -2728,20 +2677,27 @@ setMethod("randn", signature(seed = "numeric"), column(jc) }) -#' regexp_extract -#' -#' Extract a specific \code{idx} group identified by a Java regex, from the specified string column. -#' If the regex did not match, or the specified group did not match, an empty string is returned. +#' @details +#' \code{regexp_extract}: Extracts a specific \code{idx} group identified by a Java regex, +#' from the specified string column. If the regex did not match, or the specified group did +#' not match, an empty string is returned. #' -#' @param x a string Column. #' @param pattern a regular expression. #' @param idx a group index. 
-#' @family string functions -#' @rdname regexp_extract -#' @name regexp_extract -#' @aliases regexp_extract,Column,character,numeric-method +#' @rdname column_string_functions +#' @aliases regexp_extract regexp_extract,Column,character,numeric-method #' @export -#' @examples \dontrun{regexp_extract(df$c, '(\d+)-(\d+)', 1)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, s1 = regexp_extract(df$Class, "(\\d+)\\w+", 1), +#' s2 = regexp_extract(df$Sex, "^(\\w)\\w+", 1), +#' s3 = regexp_replace(df$Class, "\\D+", ""), +#' s4 = substring_index(df$Sex, "a", 1), +#' s5 = substring_index(df$Sex, "a", -1), +#' s6 = translate(df$Sex, "ale", ""), +#' s7 = translate(df$Sex, "a", "-")) +#' head(tmp)} #' @note regexp_extract since 1.5.0 setMethod("regexp_extract", signature(x = "Column", pattern = "character", idx = "numeric"), @@ -2752,19 +2708,14 @@ setMethod("regexp_extract", column(jc) }) -#' regexp_replace -#' -#' Replace all substrings of the specified string value that match regexp with rep. +#' @details +#' \code{regexp_replace}: Replaces all substrings of the specified string value that +#' match regexp with rep. #' -#' @param x a string Column. -#' @param pattern a regular expression. #' @param replacement a character string that a matched \code{pattern} is replaced with. -#' @family string functions -#' @rdname regexp_replace -#' @name regexp_replace -#' @aliases regexp_replace,Column,character,character-method +#' @rdname column_string_functions +#' @aliases regexp_replace regexp_replace,Column,character,character-method #' @export -#' @examples \dontrun{regexp_replace(df$c, '(\\d+)', '--')} #' @note regexp_replace since 1.5.0 setMethod("regexp_replace", signature(x = "Column", pattern = "character", replacement = "character"), @@ -2775,19 +2726,12 @@ setMethod("regexp_replace", column(jc) }) -#' rpad -#' -#' Right-padded with pad to a length of len. +#' @details +#' \code{rpad}: Right-padded with pad to a length of len. #' -#' @param x the string Column to be right-padded. -#' @param len maximum length of each output result. -#' @param pad a character string to be padded with. -#' @family string functions -#' @rdname rpad -#' @name rpad -#' @aliases rpad,Column,numeric,character-method +#' @rdname column_string_functions +#' @aliases rpad rpad,Column,numeric,character-method #' @export -#' @examples \dontrun{rpad(df$c, 6, '#')} #' @note rpad since 1.5.0 setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), function(x, len, pad) { @@ -2797,28 +2741,20 @@ setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"), column(jc) }) -#' substring_index -#' -#' Returns the substring from string str before count occurrences of the delimiter delim. -#' If count is positive, everything the left of the final delimiter (counting from left) is -#' returned. If count is negative, every to the right of the final delimiter (counting from the -#' right) is returned. substring_index performs a case-sensitive match when searching for delim. +#' @details +#' \code{substring_index}: Returns the substring from string str before count occurrences of +#' the delimiter delim. If count is positive, everything the left of the final delimiter +#' (counting from left) is returned. If count is negative, every to the right of the final +#' delimiter (counting from the right) is returned. substring_index performs a case-sensitive +#' match when searching for delim. #' -#' @param x a Column. #' @param delim a delimiter string. 
#' @param count number of occurrences of \code{delim} before the substring is returned. #' A positive number means counting from the left, while negative means #' counting from the right. -#' @family string functions -#' @rdname substring_index -#' @aliases substring_index,Column,character,numeric-method -#' @name substring_index +#' @rdname column_string_functions +#' @aliases substring_index substring_index,Column,character,numeric-method #' @export -#' @examples -#'\dontrun{ -#'substring_index(df$c, '.', 2) -#'substring_index(df$c, '.', -1) -#'} #' @note substring_index since 1.5.0 setMethod("substring_index", signature(x = "Column", delim = "character", count = "numeric"), @@ -2829,24 +2765,19 @@ setMethod("substring_index", column(jc) }) -#' translate -#' -#' Translate any character in the src by a character in replaceString. +#' @details +#' \code{translate}: Translates any character in the src by a character in replaceString. #' The characters in replaceString is corresponding to the characters in matchingString. #' The translate will happen when any character in the string matching with the character #' in the matchingString. #' -#' @param x a string Column. #' @param matchingString a source string where each character will be translated. #' @param replaceString a target string where each \code{matchingString} character will #' be replaced by the character in \code{replaceString} #' at the same location, if any. -#' @family string functions -#' @rdname translate -#' @name translate -#' @aliases translate,Column,character,character-method +#' @rdname column_string_functions +#' @aliases translate translate,Column,character,character-method #' @export -#' @examples \dontrun{translate(df$c, 'rnlt', '123')} #' @note translate since 1.5.0 setMethod("translate", signature(x = "Column", matchingString = "character", replaceString = "character"), @@ -3419,28 +3350,20 @@ setMethod("collect_set", column(jc) }) -#' split_string -#' -#' Splits string on regular expression. -#' -#' Equivalent to \code{split} SQL function -#' -#' @param x Column to compute on -#' @param pattern Java regular expression +#' @details +#' \code{split_string}: Splits string on regular expression. +#' Equivalent to \code{split} SQL function. #' -#' @rdname split_string -#' @family string functions -#' @aliases split_string,Column-method +#' @rdname column_string_functions +#' @aliases split_string split_string,Column-method #' @export #' @examples -#' \dontrun{ -#' df <- read.text("README.md") -#' -#' head(select(df, split_string(df$value, "\\s+"))) #' +#' \dontrun{ +#' head(select(df, split_string(df$Sex, "a"))) +#' head(select(df, split_string(df$Class, "\\d"))) #' # This is equivalent to the following SQL expression -#' head(selectExpr(df, "split(value, '\\\\s+')")) -#' } +#' head(selectExpr(df, "split(Class, '\\\\d')"))} #' @note split_string 2.3.0 setMethod("split_string", signature(x = "Column", pattern = "character"), @@ -3449,28 +3372,20 @@ setMethod("split_string", column(jc) }) -#' repeat_string -#' -#' Repeats string n times. -#' -#' Equivalent to \code{repeat} SQL function +#' @details +#' \code{repeat_string}: Repeats string n times. +#' Equivalent to \code{repeat} SQL function. 
#' -#' @param x Column to compute on #' @param n Number of repetitions -#' -#' @rdname repeat_string -#' @family string functions -#' @aliases repeat_string,Column-method +#' @rdname column_string_functions +#' @aliases repeat_string repeat_string,Column-method #' @export #' @examples -#' \dontrun{ -#' df <- read.text("README.md") -#' -#' first(select(df, repeat_string(df$value, 3))) #' +#' \dontrun{ +#' head(select(df, repeat_string(df$Class, 3))) #' # This is equivalent to the following SQL expression -#' first(selectExpr(df, "repeat(value, 3)")) -#' } +#' head(selectExpr(df, "repeat(Class, 3)"))} #' @note repeat_string since 2.3.0 setMethod("repeat_string", signature(x = "Column", n = "numeric"), diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0248ec585d771..dc99e3d94b269 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -917,8 +917,9 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCoun #' @export setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") }) -#' @rdname ascii +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("ascii", function(x) { standardGeneric("ascii") }) #' @param x Column to compute on or a GroupedData object. @@ -927,8 +928,9 @@ setGeneric("ascii", function(x) { standardGeneric("ascii") }) #' @export setGeneric("avg", function(x, ...) { standardGeneric("avg") }) -#' @rdname base64 +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("base64", function(x) { standardGeneric("base64") }) #' @rdname column_math_functions @@ -969,12 +971,14 @@ setGeneric("collect_set", function(x) { standardGeneric("collect_set") }) #' @export setGeneric("column", function(x) { standardGeneric("column") }) -#' @rdname concat +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("concat", function(x, ...) { standardGeneric("concat") }) -#' @rdname concat_ws +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") }) #' @rdname column_math_functions @@ -1038,8 +1042,9 @@ setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") }) #' @name NULL setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") }) -#' @rdname decode +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("decode", function(x, charset) { standardGeneric("decode") }) #' @param x empty. Should be used with no argument. @@ -1047,8 +1052,9 @@ setGeneric("decode", function(x, charset) { standardGeneric("decode") }) #' @export setGeneric("dense_rank", function(x = "missing") { standardGeneric("dense_rank") }) -#' @rdname encode +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("encode", function(x, charset) { standardGeneric("encode") }) #' @rdname explode @@ -1068,12 +1074,14 @@ setGeneric("expr", function(x) { standardGeneric("expr") }) #' @name NULL setGeneric("from_utc_timestamp", function(y, x) { standardGeneric("from_utc_timestamp") }) -#' @rdname format_number +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) -#' @rdname format_string +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("format_string", function(format, x, ...) 
{ standardGeneric("format_string") }) #' @rdname from_json @@ -1114,8 +1122,9 @@ setGeneric("hour", function(x) { standardGeneric("hour") }) #' @name NULL setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) -#' @rdname initcap +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("initcap", function(x) { standardGeneric("initcap") }) #' @param x empty. Should be used with no argument. @@ -1124,8 +1133,9 @@ setGeneric("initcap", function(x) { standardGeneric("initcap") }) setGeneric("input_file_name", function(x = "missing") { standardGeneric("input_file_name") }) -#' @rdname instr +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("instr", function(y, x) { standardGeneric("instr") }) #' @rdname is.nan @@ -1158,28 +1168,33 @@ setGeneric("lead", function(x, offset, defaultValue = NULL) { standardGeneric("l #' @export setGeneric("least", function(x, ...) { standardGeneric("least") }) -#' @rdname levenshtein +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") }) #' @rdname lit #' @export setGeneric("lit", function(x) { standardGeneric("lit") }) -#' @rdname locate +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("locate", function(substr, str, ...) { standardGeneric("locate") }) -#' @rdname lower +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("lower", function(x) { standardGeneric("lower") }) -#' @rdname lpad +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") }) -#' @rdname ltrim +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("ltrim", function(x) { standardGeneric("ltrim") }) #' @rdname md5 @@ -1272,21 +1287,25 @@ setGeneric("randn", function(seed) { standardGeneric("randn") }) #' @export setGeneric("rank", function(x, ...) 
{ standardGeneric("rank") }) -#' @rdname regexp_extract +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") }) -#' @rdname regexp_replace +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("regexp_replace", function(x, pattern, replacement) { standardGeneric("regexp_replace") }) -#' @rdname repeat_string +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("repeat_string", function(x, n) { standardGeneric("repeat_string") }) -#' @rdname reverse +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("reverse", function(x) { standardGeneric("reverse") }) #' @rdname column_math_functions @@ -1299,12 +1318,14 @@ setGeneric("rint", function(x) { standardGeneric("rint") }) #' @export setGeneric("row_number", function(x = "missing") { standardGeneric("row_number") }) -#' @rdname rpad +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) -#' @rdname rtrim +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) #' @rdname column_aggregate_functions @@ -1358,12 +1379,14 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") }) #' @export setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") }) -#' @rdname split_string +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("split_string", function(x, pattern) { standardGeneric("split_string") }) -#' @rdname soundex +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("soundex", function(x) { standardGeneric("soundex") }) #' @param x empty. Should be used with no argument. @@ -1390,8 +1413,9 @@ setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) #' @export setGeneric("struct", function(x, ...) 
{ standardGeneric("struct") }) -#' @rdname substring_index +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) #' @rdname column_aggregate_functions @@ -1428,16 +1452,19 @@ setGeneric("to_timestamp", function(x, format) { standardGeneric("to_timestamp") #' @name NULL setGeneric("to_utc_timestamp", function(y, x) { standardGeneric("to_utc_timestamp") }) -#' @rdname translate +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("translate", function(x, matchingString, replaceString) { standardGeneric("translate") }) -#' @rdname trim +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("trim", function(x) { standardGeneric("trim") }) -#' @rdname unbase64 +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("unbase64", function(x) { standardGeneric("unbase64") }) #' @rdname column_math_functions @@ -1450,8 +1477,9 @@ setGeneric("unhex", function(x) { standardGeneric("unhex") }) #' @name NULL setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timestamp") }) -#' @rdname upper +#' @rdname column_string_functions #' @export +#' @name NULL setGeneric("upper", function(x) { standardGeneric("upper") }) #' @rdname column_aggregate_functions From 0c8444cf6d0620cd219ddcf5f50b12ff648639e9 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 29 Jun 2017 10:32:32 +0800 Subject: [PATCH 033/779] [SPARK-14657][SPARKR][ML] RFormula w/o intercept should output reference category when encoding string terms ## What changes were proposed in this pull request? Please see [SPARK-14657](https://issues.apache.org/jira/browse/SPARK-14657) for detail of this bug. I searched online and test some other cases, found when we fit R glm model(or other models powered by R formula) w/o intercept on a dataset including string/category features, one of the categories in the first category feature is being used as reference category, we will not drop any category for that feature. I think we should keep consistent semantics between Spark RFormula and R formula. ## How was this patch tested? Add standard unit tests. cc mengxr Author: Yanbo Liang Closes #12414 from yanboliang/spark-14657. --- .../apache/spark/ml/feature/RFormula.scala | 10 ++- .../spark/ml/feature/RFormulaSuite.scala | 83 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 1fad0a6fc9443..4b44878784c90 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -205,12 +205,20 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) }.toMap // Then we handle one-hot encoding and interactions between terms. + var keepReferenceCategory = false val encodedTerms = resolvedFormula.terms.map { case Seq(term) if dataset.schema(term).dataType == StringType => val encodedCol = tmpColumn("onehot") - encoderStages += new OneHotEncoder() + var encoder = new OneHotEncoder() .setInputCol(indexed(term)) .setOutputCol(encodedCol) + // Formula w/o intercept, one of the categories in the first category feature is + // being used as reference category, we will not drop any category for that feature. 
+ if (!hasIntercept && !keepReferenceCategory) { + encoder = encoder.setDropLast(false) + keepReferenceCategory = true + } + encoderStages += encoder prefixesToRewrite(encodedCol + "_") = term + "_" encodedCol case Seq(term) => diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 41d0062c2cabd..23570d6e0b4cb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -213,6 +213,89 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(result.collect() === expected.collect()) } + test("formula w/o intercept, we should output reference category when encoding string terms") { + /* + R code: + + df <- data.frame(id = c(1, 2, 3, 4), + a = c("foo", "bar", "bar", "baz"), + b = c("zq", "zz", "zz", "zz"), + c = c(4, 4, 5, 5)) + model.matrix(id ~ a + b + c - 1, df) + + abar abaz afoo bzz c + 1 0 0 1 0 4 + 2 1 0 0 1 4 + 3 1 0 0 1 5 + 4 0 1 0 1 5 + + model.matrix(id ~ a:b + c - 1, df) + + c abar:bzq abaz:bzq afoo:bzq abar:bzz abaz:bzz afoo:bzz + 1 4 0 0 1 0 0 0 + 2 4 0 0 0 1 0 0 + 3 5 0 0 0 1 0 0 + 4 5 0 0 0 0 1 0 + */ + val original = Seq((1, "foo", "zq", 4), (2, "bar", "zz", 4), (3, "bar", "zz", 5), + (4, "baz", "zz", 5)).toDF("id", "a", "b", "c") + + val formula1 = new RFormula().setFormula("id ~ a + b + c - 1") + .setStringIndexerOrderType(StringIndexer.alphabetDesc) + val model1 = formula1.fit(original) + val result1 = model1.transform(original) + val resultSchema1 = model1.transformSchema(original.schema) + // Note the column order is different between R and Spark. + val expected1 = Seq( + (1, "foo", "zq", 4, Vectors.sparse(5, Array(0, 4), Array(1.0, 4.0)), 1.0), + (2, "bar", "zz", 4, Vectors.dense(0.0, 0.0, 1.0, 1.0, 4.0), 2.0), + (3, "bar", "zz", 5, Vectors.dense(0.0, 0.0, 1.0, 1.0, 5.0), 3.0), + (4, "baz", "zz", 5, Vectors.dense(0.0, 1.0, 0.0, 1.0, 5.0), 4.0) + ).toDF("id", "a", "b", "c", "features", "label") + assert(result1.schema.toString == resultSchema1.toString) + assert(result1.collect() === expected1.collect()) + + val attrs1 = AttributeGroup.fromStructField(result1.schema("features")) + val expectedAttrs1 = new AttributeGroup( + "features", + Array[Attribute]( + new BinaryAttribute(Some("a_foo"), Some(1)), + new BinaryAttribute(Some("a_baz"), Some(2)), + new BinaryAttribute(Some("a_bar"), Some(3)), + new BinaryAttribute(Some("b_zz"), Some(4)), + new NumericAttribute(Some("c"), Some(5)))) + assert(attrs1 === expectedAttrs1) + + // There is no impact for string terms interaction. + val formula2 = new RFormula().setFormula("id ~ a:b + c - 1") + .setStringIndexerOrderType(StringIndexer.alphabetDesc) + val model2 = formula2.fit(original) + val result2 = model2.transform(original) + val resultSchema2 = model2.transformSchema(original.schema) + // Note the column order is different between R and Spark. 
+ val expected2 = Seq( + (1, "foo", "zq", 4, Vectors.sparse(7, Array(1, 6), Array(1.0, 4.0)), 1.0), + (2, "bar", "zz", 4, Vectors.sparse(7, Array(4, 6), Array(1.0, 4.0)), 2.0), + (3, "bar", "zz", 5, Vectors.sparse(7, Array(4, 6), Array(1.0, 5.0)), 3.0), + (4, "baz", "zz", 5, Vectors.sparse(7, Array(2, 6), Array(1.0, 5.0)), 4.0) + ).toDF("id", "a", "b", "c", "features", "label") + assert(result2.schema.toString == resultSchema2.toString) + assert(result2.collect() === expected2.collect()) + + val attrs2 = AttributeGroup.fromStructField(result2.schema("features")) + val expectedAttrs2 = new AttributeGroup( + "features", + Array[Attribute]( + new NumericAttribute(Some("a_foo:b_zz"), Some(1)), + new NumericAttribute(Some("a_foo:b_zq"), Some(2)), + new NumericAttribute(Some("a_baz:b_zz"), Some(3)), + new NumericAttribute(Some("a_baz:b_zq"), Some(4)), + new NumericAttribute(Some("a_bar:b_zz"), Some(5)), + new NumericAttribute(Some("a_bar:b_zq"), Some(6)), + new NumericAttribute(Some("c"), Some(7)))) + assert(attrs2 === expectedAttrs2) + } + test("index string label") { val formula = new RFormula().setFormula("id ~ a + b") val original = From db44f5f3e8b5bc28c33b154319539d51c05a089c Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 28 Jun 2017 19:36:00 -0700 Subject: [PATCH 034/779] [SPARK-21224][R] Specify a schema by using a DDL-formatted string when reading in R ## What changes were proposed in this pull request? This PR proposes to support a DDL-formetted string as schema as below: ```r mockLines <- c("{\"name\":\"Michael\"}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}") jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines, jsonPath) df <- read.df(jsonPath, "json", "name STRING, age DOUBLE") collect(df) ``` ## How was this patch tested? Tests added in `test_streaming.R` and `test_sparkSQL.R` and manual tests. Author: hyukjinkwon Closes #18431 from HyukjinKwon/r-ddl-schema. --- R/pkg/R/SQLContext.R | 38 +++++++++++++------ R/pkg/tests/fulltests/test_sparkSQL.R | 20 +++++++++- R/pkg/tests/fulltests/test_streaming.R | 23 +++++++++++ .../org/apache/spark/sql/api/r/SQLUtils.scala | 15 -------- 4 files changed, 67 insertions(+), 29 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index e3528bc7c3135..3b7f71bbbffb8 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -584,7 +584,7 @@ tableToDF <- function(tableName) { #' #' @param path The path of files to load #' @param source The name of external data source -#' @param schema The data schema defined in structType +#' @param schema The data schema defined in structType or a DDL-formatted string. #' @param na.strings Default string value for NA when source is "csv" #' @param ... additional external data source specific named properties. 
#' @return SparkDataFrame @@ -600,6 +600,8 @@ tableToDF <- function(tableName) { #' structField("info", "map")) #' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE) #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true") +#' stringSchema <- "name STRING, info MAP" +#' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE) #' } #' @name read.df #' @method read.df default @@ -623,14 +625,19 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.string if (source == "csv" && is.null(options[["nullValue"]])) { options[["nullValue"]] <- na.strings } + read <- callJMethod(sparkSession, "read") + read <- callJMethod(read, "format", source) if (!is.null(schema)) { - stopifnot(class(schema) == "structType") - sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, - source, schema$jobj, options) - } else { - sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, - source, options) + if (class(schema) == "structType") { + read <- callJMethod(read, "schema", schema$jobj) + } else if (is.character(schema)) { + read <- callJMethod(read, "schema", schema) + } else { + stop("schema should be structType or character.") + } } + read <- callJMethod(read, "options", options) + sdf <- handledCallJMethod(read, "load") dataFrame(sdf) } @@ -717,8 +724,8 @@ read.jdbc <- function(url, tableName, #' "spark.sql.sources.default" will be used. #' #' @param source The name of external data source -#' @param schema The data schema defined in structType, this is required for file-based streaming -#' data source +#' @param schema The data schema defined in structType or a DDL-formatted string, this is +#' required for file-based streaming data source #' @param ... additional external data source specific named options, for instance \code{path} for #' file-based streaming data source #' @return SparkDataFrame @@ -733,6 +740,8 @@ read.jdbc <- function(url, tableName, #' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp") #' #' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) +#' stringSchema <- "name STRING, info MAP" +#' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1) #' } #' @name read.stream #' @note read.stream since 2.2.0 @@ -750,10 +759,15 @@ read.stream <- function(source = NULL, schema = NULL, ...) { read <- callJMethod(sparkSession, "readStream") read <- callJMethod(read, "format", source) if (!is.null(schema)) { - stopifnot(class(schema) == "structType") - read <- callJMethod(read, "schema", schema$jobj) + if (class(schema) == "structType") { + read <- callJMethod(read, "schema", schema$jobj) + } else if (is.character(schema)) { + read <- callJMethod(read, "schema", schema) + } else { + stop("schema should be structType or character.") + } } read <- callJMethod(read, "options", options) sdf <- handledCallJMethod(read, "load") - dataFrame(callJMethod(sdf, "toDF")) + dataFrame(sdf) } diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 911b73b9ee551..a2bcb5aefe16d 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3248,9 +3248,9 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume # It makes sure that we can omit path argument in read.df API and then it calls # DataFrameWriter.load() without path. 
expect_error(read.df(source = "json"), - paste("Error in loadDF : analysis error - Unable to infer schema for JSON.", + paste("Error in load : analysis error - Unable to infer schema for JSON.", "It must be specified manually")) - expect_error(read.df("arbitrary_path"), "Error in loadDF : analysis error - Path does not exist") + expect_error(read.df("arbitrary_path"), "Error in load : analysis error - Path does not exist") expect_error(read.json("arbitrary_path"), "Error in json : analysis error - Path does not exist") expect_error(read.text("arbitrary_path"), "Error in text : analysis error - Path does not exist") expect_error(read.orc("arbitrary_path"), "Error in orc : analysis error - Path does not exist") @@ -3268,6 +3268,22 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume "Unnamed arguments ignored: 2, 3, a.") }) +test_that("Specify a schema by using a DDL-formatted string when reading", { + # Test read.df with a user defined schema in a DDL-formatted string. + df1 <- read.df(jsonPath, "json", "name STRING, age DOUBLE") + expect_is(df1, "SparkDataFrame") + expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double"))) + + expect_error(read.df(jsonPath, "json", "name stri"), "DataType stri is not supported.") + + # Test loadDF with a user defined schema in a DDL-formatted string. + df2 <- loadDF(jsonPath, "json", "name STRING, age DOUBLE") + expect_is(df2, "SparkDataFrame") + expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double"))) + + expect_error(loadDF(jsonPath, "json", "name stri"), "DataType stri is not supported.") +}) + test_that("Collect on DataFrame when NAs exists at the top of a timestamp column", { ldf <- data.frame(col1 = c(0, 1, 2), col2 = c(as.POSIXct("2017-01-01 00:00:01"), diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index d691de7cd725d..54f40bbd5f517 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -46,6 +46,8 @@ schema <- structType(structField("name", "string"), structField("age", "integer"), structField("count", "double")) +stringSchema <- "name STRING, age INTEGER, count DOUBLE" + test_that("read.stream, write.stream, awaitTermination, stopQuery", { df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) expect_true(isStreaming(df)) @@ -111,6 +113,27 @@ test_that("Stream other format", { unlink(parquetPath) }) +test_that("Specify a schema by using a DDL-formatted string when reading", { + # Test read.stream with a user defined schema in a DDL-formatted string. 
+ parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + df <- read.df(jsonPath, "json", schema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = stringSchema) + expect_true(isStreaming(df)) + counts <- count(group_by(df, "name")) + q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete") + + expect_false(awaitTermination(q, 5 * 1000)) + callJMethod(q@ssq, "processAllAvailable") + expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3) + + expect_error(read.stream(path = parquetPath, schema = "name stri"), + "DataType stri is not supported.") + + unlink(parquetPath) +}) + test_that("Non-streaming DataFrame", { c <- as.DataFrame(cars) expect_false(isStreaming(c)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index d94e528a3ad47..9bd2987057dbc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -193,21 +193,6 @@ private[sql] object SQLUtils extends Logging { } } - def loadDF( - sparkSession: SparkSession, - source: String, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.read.format(source).options(options).load() - } - - def loadDF( - sparkSession: SparkSession, - source: String, - schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.read.format(source).schema(schema).options(options).load() - } - def readSqlObject(dis: DataInputStream, dataType: Char): Object = { dataType match { case 's' => From fc92d25f2a27e81ef2d5031dcf856af1cc1d8c31 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 28 Jun 2017 20:06:29 -0700 Subject: [PATCH 035/779] Revert "[SPARK-21094][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak" This reverts commit 6b3d02285ee0debc73cbcab01b10398a498fbeb8. --- R/pkg/inst/worker/daemon.R | 59 +++----------------------------------- 1 file changed, 4 insertions(+), 55 deletions(-) diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 6e385b2a27622..3a318b71ea06d 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,55 +30,8 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) -# Waits indefinitely for a socket connecion by default. -selectTimeout <- NULL - -# Exit code that children send to the parent to indicate they exited. -exitCode <- 1 - while (TRUE) { - ready <- socketSelect(list(inputCon), timeout = selectTimeout) - - # Note that the children should be terminated in the parent. If each child terminates - # itself, it appears that the resource is not released properly, that causes an unexpected - # termination of this daemon due to, for example, running out of file descriptors - # (see SPARK-21093). Therefore, the current implementation tries to retrieve children - # that are exited (but not terminated) and then sends a kill signal to terminate them properly - # in the parent. - # - # There are two paths that it attempts to send a signal to terminate the children in the parent. - # - # 1. Every second if any socket connection is not available and if there are child workers - # running. - # 2. Right after a socket connection is available. 
- # - # In other words, the parent attempts to send the signal to the children every second if - # any worker is running or right before launching other worker children from the following - # new socket connection. - - # Only the process IDs of children sent data to the parent are returned below. The children - # send a custom exit code to the parent after being exited and the parent tries - # to terminate them only if they sent the exit code. - children <- parallel:::selectChildren(timeout = 0) - - if (is.integer(children)) { - lapply(children, function(child) { - # This data should be raw bytes if any data was sent from this child. - # Otherwise, this returns the PID. - data <- parallel:::readChild(child) - if (is.raw(data)) { - # This checks if the data from this child is the exit code that indicates an exited child. - if (unserialize(data) == exitCode) { - # If so, we terminate this child. - tools::pskill(child, tools::SIGUSR1) - } - } - }) - } else if (is.null(children)) { - # If it is NULL, there are no children. Waits indefinitely for a socket connecion. - selectTimeout <- NULL - } - + ready <- socketSelect(list(inputCon)) if (ready) { port <- SparkR:::readInt(inputCon) # There is a small chance that it could be interrupted by signal, retry one time @@ -91,16 +44,12 @@ while (TRUE) { } p <- parallel:::mcfork() if (inherits(p, "masterProcess")) { - # Reach here because this is a child process. close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) try(source(script)) - # Note that this mcexit does not fully terminate this child. So, this writes back - # a custom exit code so that the parent can read and terminate this child. - parallel:::mcexit(0L, send = exitCode) - } else { - # Forking succeeded and we need to check if they finished their jobs every second. - selectTimeout <- 1 + # Set SIGUSR1 so that child can exit + tools::pskill(Sys.getpid(), tools::SIGUSR1) + parallel:::mcexit(0L) } } } From 25c2edf6f9da9d4d45fc628cf97de657f2a2cc7e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 29 Jun 2017 11:21:50 +0800 Subject: [PATCH 036/779] [SPARK-21229][SQL] remove QueryPlan.preCanonicalized ## What changes were proposed in this pull request? `QueryPlan.preCanonicalized` is only overridden in a few places, and it does introduce an extra concept to `QueryPlan` which may confuse people. This PR removes it and override `canonicalized` in these places ## How was this patch tested? existing tests Author: Wenchen Fan Closes #18440 from cloud-fan/minor. 
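For illustration, a minimal sketch of the resulting pattern (not part of this patch): a plan node overrides `canonicalized` directly and normalizes its own expression IDs, rather than hooking into the removed `preCanonicalized`. The node name `MyRelation` and its `metadata` field are hypothetical; the normalization call mirrors the `LogicalRelation` change below.

```scala
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}

// Hypothetical leaf node, used only to illustrate the pattern.
case class MyRelation(
    output: Seq[Attribute],
    metadata: Map[String, String]) extends LeafNode {

  // Instead of overriding the removed `preCanonicalized`, override `canonicalized`:
  // drop fields that should not affect sameResult() and give the output
  // deterministic expression IDs, so two otherwise-equal relations compare equal.
  override lazy val canonicalized: LogicalPlan = copy(
    output = output.map(QueryPlan.normalizeExprId(_, output)),
    metadata = Map.empty)

  override def computeStats: Statistics = Statistics(sizeInBytes = 1)
}
```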
--- .../sql/catalyst/catalog/interface.scala | 23 +++++++++++-------- .../spark/sql/catalyst/plans/QueryPlan.scala | 13 ++++------- .../sql/execution/DataSourceScanExec.scala | 8 +++++-- .../datasources/LogicalRelation.scala | 5 +++- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index b63bef9193332..da50b0e7e8e42 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -27,7 +27,8 @@ import com.google.common.base.Objects import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Cast, Literal} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Cast, ExprId, Literal} +import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.catalyst.util.quoteIdentifier @@ -425,15 +426,17 @@ case class CatalogRelation( Objects.hashCode(tableMeta.identifier, output) } - override def preCanonicalized: LogicalPlan = copy(tableMeta = CatalogTable( - identifier = tableMeta.identifier, - tableType = tableMeta.tableType, - storage = CatalogStorageFormat.empty, - schema = tableMeta.schema, - partitionColumnNames = tableMeta.partitionColumnNames, - bucketSpec = tableMeta.bucketSpec, - createTime = -1 - )) + override lazy val canonicalized: LogicalPlan = copy( + tableMeta = tableMeta.copy( + storage = CatalogStorageFormat.empty, + createTime = -1 + ), + dataCols = dataCols.zipWithIndex.map { + case (attr, index) => attr.withExprId(ExprId(index)) + }, + partitionCols = partitionCols.zipWithIndex.map { + case (attr, index) => attr.withExprId(ExprId(index + dataCols.length)) + }) override def computeStats: Statistics = { // For data source tables, we will create a `LogicalRelation` and won't call this method, for diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 01b3da3f7c482..7addbaaa9afa5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -188,12 +188,13 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT * Plans where `this.canonicalized == other.canonicalized` will always evaluate to the same * result. * - * Some nodes should overwrite this to provide proper canonicalize logic. + * Some nodes should overwrite this to provide proper canonicalize logic, but they should remove + * expressions cosmetic variations themselves. 
*/ lazy val canonicalized: PlanType = { val canonicalizedChildren = children.map(_.canonicalized) var id = -1 - preCanonicalized.mapExpressions { + mapExpressions { case a: Alias => id += 1 // As the root of the expression, Alias will always take an arbitrary exprId, we need to @@ -206,18 +207,12 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT // Top level `AttributeReference` may also be used for output like `Alias`, we should // normalize the epxrId too. id += 1 - ar.withExprId(ExprId(id)) + ar.withExprId(ExprId(id)).canonicalized case other => QueryPlan.normalizeExprId(other, allAttributes) }.withNewChildren(canonicalizedChildren) } - /** - * Do some simple transformation on this plan before canonicalizing. Implementations can override - * this method to provide customized canonicalize logic without rewriting the whole logic. - */ - protected def preCanonicalized: PlanType = this - /** * Returns true when the given query plan will return the same results as this query plan. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 74fc23a52a141..a0def68d88e0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -138,8 +138,12 @@ case class RowDataSourceScanExec( } // Only care about `relation` and `metadata` when canonicalizing. - override def preCanonicalized: SparkPlan = - copy(rdd = null, outputPartitioning = null, metastoreTableIdentifier = None) + override lazy val canonicalized: SparkPlan = + copy( + output.map(QueryPlan.normalizeExprId(_, output)), + rdd = null, + outputPartitioning = null, + metastoreTableIdentifier = None) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index c1b2895f1747e..6ba190b9e5dcf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference} +import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.util.Utils @@ -43,7 +44,9 @@ case class LogicalRelation( } // Only care about relation when canonicalizing. - override def preCanonicalized: LogicalPlan = copy(catalogTable = None) + override lazy val canonicalized: LogicalPlan = copy( + output = output.map(QueryPlan.normalizeExprId(_, output)), + catalogTable = None) @transient override def computeStats: Statistics = { catalogTable.flatMap(_.stats.map(_.toPlanStats(output))).getOrElse( From 82e24912d6e15a9e4fbadd83da9a08d4f80a592b Mon Sep 17 00:00:00 2001 From: wangzhenhua Date: Thu, 29 Jun 2017 11:32:29 +0800 Subject: [PATCH 037/779] [SPARK-21237][SQL] Invalidate stats once table data is changed ## What changes were proposed in this pull request? 
Invalidate spark's stats after data changing commands: - InsertIntoHadoopFsRelationCommand - InsertIntoHiveTable - LoadDataCommand - TruncateTableCommand - AlterTableSetLocationCommand - AlterTableDropPartitionCommand ## How was this patch tested? Added test cases. Author: wangzhenhua Closes #18449 from wzhfy/removeStats. --- .../catalyst/catalog/ExternalCatalog.scala | 3 +- .../catalyst/catalog/InMemoryCatalog.scala | 4 +- .../sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../catalog/ExternalCatalogSuite.scala | 2 +- .../catalog/SessionCatalogSuite.scala | 2 +- .../command/AnalyzeColumnCommand.scala | 4 +- .../command/AnalyzeTableCommand.scala | 76 +--------- .../sql/execution/command/CommandUtils.scala | 102 ++++++++++++++ .../spark/sql/execution/command/ddl.scala | 9 +- .../spark/sql/execution/command/tables.scala | 7 + .../InsertIntoHadoopFsRelationCommand.scala | 5 + .../spark/sql/StatisticsCollectionSuite.scala | 85 ++++++++++-- .../apache/spark/sql/test/SQLTestUtils.scala | 14 ++ .../spark/sql/hive/HiveExternalCatalog.scala | 24 ++-- .../hive/execution/InsertIntoHiveTable.scala | 4 +- .../spark/sql/hive/StatisticsSuite.scala | 130 ++++++++++++++---- 16 files changed, 340 insertions(+), 133 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 12ba5aedde026..0254b6bb6d136 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -160,7 +160,8 @@ abstract class ExternalCatalog */ def alterTableSchema(db: String, table: String, schema: StructType): Unit - def alterTableStats(db: String, table: String, stats: CatalogStatistics): Unit + /** Alter the statistics of a table. If `stats` is None, then remove all existing statistics. 
*/ + def alterTableStats(db: String, table: String, stats: Option[CatalogStatistics]): Unit def getTable(db: String, table: String): CatalogTable diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 9820522a230e3..747190faa3c8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -315,10 +315,10 @@ class InMemoryCatalog( override def alterTableStats( db: String, table: String, - stats: CatalogStatistics): Unit = synchronized { + stats: Option[CatalogStatistics]): Unit = synchronized { requireTableExists(db, table) val origTable = catalog(db).tables(table).table - catalog(db).tables(table).table = origTable.copy(stats = Some(stats)) + catalog(db).tables(table).table = origTable.copy(stats = stats) } override def getTable(db: String, table: String): CatalogTable = synchronized { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index cf02da8993658..7ece77df7fc14 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -380,7 +380,7 @@ class SessionCatalog( * Alter Spark's statistics of an existing metastore table identified by the provided table * identifier. */ - def alterTableStats(identifier: TableIdentifier, newStats: CatalogStatistics): Unit = { + def alterTableStats(identifier: TableIdentifier, newStats: Option[CatalogStatistics]): Unit = { val db = formatDatabaseName(identifier.database.getOrElse(getCurrentDatabase)) val table = formatTableName(identifier.table) val tableIdentifier = TableIdentifier(table, Some(db)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 557b0970b54e5..c22d55fc96a65 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -260,7 +260,7 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac val oldTableStats = catalog.getTable("db2", "tbl1").stats assert(oldTableStats.isEmpty) val newStats = CatalogStatistics(sizeInBytes = 1) - catalog.alterTableStats("db2", "tbl1", newStats) + catalog.alterTableStats("db2", "tbl1", Some(newStats)) val newTableStats = catalog.getTable("db2", "tbl1").stats assert(newTableStats.get == newStats) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index a6dc21b03d446..fc3893e197792 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -454,7 +454,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { val oldTableStats = catalog.getTableMetadata(tableId).stats assert(oldTableStats.isEmpty) val newStats = CatalogStatistics(sizeInBytes = 1) - 
catalog.alterTableStats(tableId, newStats) + catalog.alterTableStats(tableId, Some(newStats)) val newTableStats = catalog.getTableMetadata(tableId).stats assert(newTableStats.get == newStats) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 2f273b63e8348..6588993ef9ad9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -42,7 +42,7 @@ case class AnalyzeColumnCommand( if (tableMeta.tableType == CatalogTableType.VIEW) { throw new AnalysisException("ANALYZE TABLE is not supported on views.") } - val sizeInBytes = AnalyzeTableCommand.calculateTotalSize(sessionState, tableMeta) + val sizeInBytes = CommandUtils.calculateTotalSize(sessionState, tableMeta) // Compute stats for each column val (rowCount, newColStats) = computeColumnStats(sparkSession, tableIdentWithDB, columnNames) @@ -54,7 +54,7 @@ case class AnalyzeColumnCommand( // Newly computed column stats should override the existing ones. colStats = tableMeta.stats.map(_.colStats).getOrElse(Map.empty) ++ newColStats) - sessionState.catalog.alterTableStats(tableIdentWithDB, statistics) + sessionState.catalog.alterTableStats(tableIdentWithDB, Some(statistics)) // Refresh the cached data source table in the catalog. sessionState.catalog.refreshTable(tableIdentWithDB) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index 13b8faff844c7..d780ef42f3fae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -17,18 +17,10 @@ package org.apache.spark.sql.execution.command -import java.net.URI - -import scala.util.control.NonFatal - -import org.apache.hadoop.fs.{FileSystem, Path} - -import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTableType} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.internal.SessionState /** @@ -46,7 +38,7 @@ case class AnalyzeTableCommand( if (tableMeta.tableType == CatalogTableType.VIEW) { throw new AnalysisException("ANALYZE TABLE is not supported on views.") } - val newTotalSize = AnalyzeTableCommand.calculateTotalSize(sessionState, tableMeta) + val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta) val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(0L) val oldRowCount = tableMeta.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L) @@ -74,7 +66,7 @@ case class AnalyzeTableCommand( // Update the metastore if the above statistics of the table are different from those // recorded in the metastore. if (newStats.isDefined) { - sessionState.catalog.alterTableStats(tableIdentWithDB, newStats.get) + sessionState.catalog.alterTableStats(tableIdentWithDB, newStats) // Refresh the cached data source table in the catalog. 
sessionState.catalog.refreshTable(tableIdentWithDB) } @@ -82,65 +74,3 @@ case class AnalyzeTableCommand( Seq.empty[Row] } } - -object AnalyzeTableCommand extends Logging { - - def calculateTotalSize(sessionState: SessionState, catalogTable: CatalogTable): Long = { - if (catalogTable.partitionColumnNames.isEmpty) { - calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri) - } else { - // Calculate table size as a sum of the visible partitions. See SPARK-21079 - val partitions = sessionState.catalog.listPartitions(catalogTable.identifier) - partitions.map(p => - calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri) - ).sum - } - } - - private def calculateLocationSize( - sessionState: SessionState, - tableId: TableIdentifier, - locationUri: Option[URI]): Long = { - // This method is mainly based on - // org.apache.hadoop.hive.ql.stats.StatsUtils.getFileSizeForTable(HiveConf, Table) - // in Hive 0.13 (except that we do not use fs.getContentSummary). - // TODO: Generalize statistics collection. - // TODO: Why fs.getContentSummary returns wrong size on Jenkins? - // Can we use fs.getContentSummary in future? - // Seems fs.getContentSummary returns wrong table size on Jenkins. So we use - // countFileSize to count the table size. - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") - - def calculateLocationSize(fs: FileSystem, path: Path): Long = { - val fileStatus = fs.getFileStatus(path) - val size = if (fileStatus.isDirectory) { - fs.listStatus(path) - .map { status => - if (!status.getPath.getName.startsWith(stagingDir)) { - calculateLocationSize(fs, status.getPath) - } else { - 0L - } - }.sum - } else { - fileStatus.getLen - } - - size - } - - locationUri.map { p => - val path = new Path(p) - try { - val fs = path.getFileSystem(sessionState.newHadoopConf()) - calculateLocationSize(fs, path) - } catch { - case NonFatal(e) => - logWarning( - s"Failed to get the size of table ${tableId.table} in the " + - s"database ${tableId.database} because of ${e.toString}", e) - 0L - } - }.getOrElse(0L) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala new file mode 100644 index 0000000000000..92397607f38fd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -0,0 +1,102 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql.execution.command + +import java.net.URI + +import scala.util.control.NonFatal + +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable} +import org.apache.spark.sql.internal.SessionState + + +object CommandUtils extends Logging { + + /** Change statistics after changing data by commands. */ + def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = { + if (table.stats.nonEmpty) { + val catalog = sparkSession.sessionState.catalog + catalog.alterTableStats(table.identifier, None) + } + } + + def calculateTotalSize(sessionState: SessionState, catalogTable: CatalogTable): BigInt = { + if (catalogTable.partitionColumnNames.isEmpty) { + calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri) + } else { + // Calculate table size as a sum of the visible partitions. See SPARK-21079 + val partitions = sessionState.catalog.listPartitions(catalogTable.identifier) + partitions.map { p => + calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri) + }.sum + } + } + + def calculateLocationSize( + sessionState: SessionState, + identifier: TableIdentifier, + locationUri: Option[URI]): Long = { + // This method is mainly based on + // org.apache.hadoop.hive.ql.stats.StatsUtils.getFileSizeForTable(HiveConf, Table) + // in Hive 0.13 (except that we do not use fs.getContentSummary). + // TODO: Generalize statistics collection. + // TODO: Why fs.getContentSummary returns wrong size on Jenkins? + // Can we use fs.getContentSummary in future? + // Seems fs.getContentSummary returns wrong table size on Jenkins. So we use + // countFileSize to count the table size. 
+ val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") + + def getPathSize(fs: FileSystem, path: Path): Long = { + val fileStatus = fs.getFileStatus(path) + val size = if (fileStatus.isDirectory) { + fs.listStatus(path) + .map { status => + if (!status.getPath.getName.startsWith(stagingDir)) { + getPathSize(fs, status.getPath) + } else { + 0L + } + }.sum + } else { + fileStatus.getLen + } + + size + } + + locationUri.map { p => + val path = new Path(p) + try { + val fs = path.getFileSystem(sessionState.newHadoopConf()) + getPathSize(fs, path) + } catch { + case NonFatal(e) => + logWarning( + s"Failed to get the size of table ${identifier.table} in the " + + s"database ${identifier.database} because of ${e.toString}", e) + 0L + } + }.getOrElse(0L) + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 413f5f3ba539c..ac897c1b22d77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -433,9 +433,11 @@ case class AlterTableAddPartitionCommand( sparkSession.sessionState.conf.resolver) // inherit table storage format (possibly except for location) CatalogTablePartition(normalizedSpec, table.storage.copy( - locationUri = location.map(CatalogUtils.stringToURI(_)))) + locationUri = location.map(CatalogUtils.stringToURI))) } catalog.createPartitions(table.identifier, parts, ignoreIfExists = ifNotExists) + + CommandUtils.updateTableStats(sparkSession, table) Seq.empty[Row] } @@ -519,6 +521,9 @@ case class AlterTableDropPartitionCommand( catalog.dropPartitions( table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge, retainData = retainData) + + CommandUtils.updateTableStats(sparkSession, table) + Seq.empty[Row] } @@ -768,6 +773,8 @@ case class AlterTableSetLocationCommand( // No partition spec is specified, so we set the location for the table itself catalog.alterTable(table.withNewStorage(locationUri = Some(locUri))) } + + CommandUtils.updateTableStats(sparkSession, table) Seq.empty[Row] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index b937a8a9f375b..8ded1060f7bf0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -400,6 +400,7 @@ case class LoadDataCommand( // Refresh the metadata cache to ensure the data visible to the users catalog.refreshTable(targetTable.identifier) + CommandUtils.updateTableStats(sparkSession, targetTable) Seq.empty[Row] } } @@ -487,6 +488,12 @@ case class TruncateTableCommand( case NonFatal(e) => log.warn(s"Exception when attempting to uncache table $tableIdentWithDB", e) } + + if (table.stats.nonEmpty) { + // empty table after truncation + val newStats = CatalogStatistics(sizeInBytes = 0, rowCount = Some(0)) + catalog.alterTableStats(tableName, Some(newStats)) + } Seq.empty[Row] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 00aa1240886e4..ab26f2affbce5 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -161,6 +161,11 @@ case class InsertIntoHadoopFsRelationCommand( fileIndex.foreach(_.refresh()) // refresh data cache if table is cached sparkSession.catalog.refreshByPath(outputPath.toString) + + if (catalogTable.nonEmpty) { + CommandUtils.updateTableStats(sparkSession, catalogTable.get) + } + } else { logInfo("Skipping insertion into a relation that already exists.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 9824062f969b3..b031c52dad8b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -40,17 +40,6 @@ import org.apache.spark.sql.types._ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSQLContext { import testImplicits._ - private def checkTableStats(tableName: String, expectedRowCount: Option[Int]) - : Option[CatalogStatistics] = { - val df = spark.table(tableName) - val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation => - assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount) - rel.catalogTable.get.stats - } - assert(stats.size == 1) - stats.head - } - test("estimates the size of a limit 0 on outer join") { withTempView("test") { Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") @@ -96,11 +85,11 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared // noscan won't count the number of rows sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - checkTableStats(tableName, expectedRowCount = None) + checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = None) // without noscan, we count the number of rows sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS") - checkTableStats(tableName, expectedRowCount = Some(2)) + checkTableStats(tableName, hasSizeInBytes = true, expectedRowCounts = Some(2)) } } @@ -168,6 +157,60 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared assert(stats.simpleString == expectedString) } } + + test("change stats after truncate command") { + val table = "change_stats_truncate_table" + withTable(table) { + spark.range(100).select($"id", $"id" % 5 as "value").write.saveAsTable(table) + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS id, value") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(100)) + assert(fetched1.get.sizeInBytes > 0) + assert(fetched1.get.colStats.size == 2) + + // truncate table command + sql(s"TRUNCATE TABLE $table") + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched2.get.sizeInBytes == 0) + assert(fetched2.get.colStats.isEmpty) + } + } + + test("change stats after set location command") { + val table = "change_stats_set_location_table" + withTable(table) { + spark.range(100).select($"id", $"id" % 5 as "value").write.saveAsTable(table) + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS id, value") + val fetched1 = checkTableStats( + table, hasSizeInBytes = true, expectedRowCounts = Some(100)) + 
assert(fetched1.get.sizeInBytes > 0) + assert(fetched1.get.colStats.size == 2) + + // set location command + withTempDir { newLocation => + sql(s"ALTER TABLE $table SET LOCATION '${newLocation.toURI.toString}'") + checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + } + } + } + + test("change stats after insert command for datasource table") { + val table = "change_stats_insert_datasource_table" + withTable(table) { + sql(s"CREATE TABLE $table (i int, j string) USING PARQUET") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + // insert into command + sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") + checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + } + } + } @@ -219,6 +262,22 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils private val randomName = new Random(31) + def checkTableStats( + tableName: String, + hasSizeInBytes: Boolean, + expectedRowCounts: Option[Int]): Option[CatalogStatistics] = { + val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats + if (hasSizeInBytes || expectedRowCounts.nonEmpty) { + assert(stats.isDefined) + assert(stats.get.sizeInBytes >= 0) + assert(stats.get.rowCount === expectedRowCounts) + } else { + assert(stats.isEmpty) + } + + stats + } + /** * Compute column stats for the given DataFrame and compare it with colStats. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index f6d47734d7e83..d74a7cce25ed6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -149,6 +149,7 @@ private[sql] trait SQLTestUtils .getExecutorInfos.map(_.numRunningTasks()).sum == 0) } } + /** * Creates a temporary directory, which is then passed to `f` and will be deleted after `f` * returns. @@ -164,6 +165,19 @@ private[sql] trait SQLTestUtils } } + /** + * Creates the specified number of temporary directories, which is then passed to `f` and will be + * deleted after `f` returns. + */ + protected def withTempPaths(numPaths: Int)(f: Seq[File] => Unit): Unit = { + val files = Array.fill[File](numPaths)(Utils.createTempDir().getCanonicalFile) + try f(files) finally { + // wait for all tasks to finish before deleting files + waitForTasksToFinish() + files.foreach(Utils.deleteRecursively) + } + } + /** * Drops functions after calling `f`. A function is represented by (functionName, isTemporary). 
*/ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 6e7c475fa34c9..2a17849fa8a34 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -631,21 +631,23 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat override def alterTableStats( db: String, table: String, - stats: CatalogStatistics): Unit = withClient { + stats: Option[CatalogStatistics]): Unit = withClient { requireTableExists(db, table) val rawTable = getRawTable(db, table) // convert table statistics to properties so that we can persist them through hive client - var statsProperties: Map[String, String] = - Map(STATISTICS_TOTAL_SIZE -> stats.sizeInBytes.toString()) - if (stats.rowCount.isDefined) { - statsProperties += STATISTICS_NUM_ROWS -> stats.rowCount.get.toString() - } - val colNameTypeMap: Map[String, DataType] = - rawTable.schema.fields.map(f => (f.name, f.dataType)).toMap - stats.colStats.foreach { case (colName, colStat) => - colStat.toMap(colName, colNameTypeMap(colName)).foreach { case (k, v) => - statsProperties += (columnStatKeyPropName(colName, k) -> v) + val statsProperties = new mutable.HashMap[String, String]() + if (stats.isDefined) { + statsProperties += STATISTICS_TOTAL_SIZE -> stats.get.sizeInBytes.toString() + if (stats.get.rowCount.isDefined) { + statsProperties += STATISTICS_NUM_ROWS -> stats.get.rowCount.get.toString() + } + val colNameTypeMap: Map[String, DataType] = + rawTable.schema.fields.map(f => (f.name, f.dataType)).toMap + stats.get.colStats.foreach { case (colName, colStat) => + colStat.toMap(colName, colNameTypeMap(colName)).foreach { case (k, v) => + statsProperties += (columnStatKeyPropName(colName, k) -> v) + } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 392b7cfaa8eff..223d375232393 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.command.RunnableCommand +import org.apache.spark.sql.execution.command.{CommandUtils, RunnableCommand} import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} @@ -434,6 +434,8 @@ case class InsertIntoHiveTable( sparkSession.catalog.uncacheTable(table.identifier.quotedString) sparkSession.sessionState.catalog.refreshTable(table.identifier) + CommandUtils.updateTableStats(sparkSession, table) + // It would be nice to just return the childRdd unchanged so insert operations could be chained, // however for now we return an empty list to simplify compatibility checks with hive, which // does not return anything for insert operations. 
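For reference, a minimal sketch (not part of this patch) of how a data-changing command hooks into `CommandUtils`, following the same pattern this patch applies to `InsertIntoHiveTable` and `LoadDataCommand`; the command name `MyDataChangingCommand` is hypothetical:

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.{CommandUtils, RunnableCommand}

// Hypothetical command illustrating the stats-invalidation pattern in this patch.
case class MyDataChangingCommand(tableIdent: TableIdentifier) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    val table = catalog.getTableMetadata(tableIdent)

    // ... mutate the table's data here ...

    // If the table had Spark statistics, remove them so stale estimates are not
    // used for planning until the table is analyzed again.
    CommandUtils.updateTableStats(sparkSession, table)
    Seq.empty[Row]
  }
}
```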
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 64deb3818d5d1..5fd266c2d033c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -30,10 +30,12 @@ import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.joins._ +import org.apache.spark.sql.hive.HiveExternalCatalog._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ + class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton { test("Hive serde tables should fallback to HDFS for size estimation") { @@ -219,23 +221,6 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } - private def checkTableStats( - tableName: String, - hasSizeInBytes: Boolean, - expectedRowCounts: Option[Int]): Option[CatalogStatistics] = { - val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats - - if (hasSizeInBytes || expectedRowCounts.nonEmpty) { - assert(stats.isDefined) - assert(stats.get.sizeInBytes > 0) - assert(stats.get.rowCount === expectedRowCounts) - } else { - assert(stats.isEmpty) - } - - stats - } - test("test table-level statistics for hive tables created in HiveExternalCatalog") { val textTable = "textTable" withTable(textTable) { @@ -326,7 +311,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto descOutput: Seq[String], propKey: String): Option[BigInt] = { val str = descOutput - .filterNot(_.contains(HiveExternalCatalog.STATISTICS_PREFIX)) + .filterNot(_.contains(STATISTICS_PREFIX)) .filter(_.contains(propKey)) if (str.isEmpty) { None @@ -448,6 +433,103 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto "ALTER TABLE unset_prop_table UNSET TBLPROPERTIES ('prop1')") } + /** + * To see if stats exist, we need to check spark's stats properties instead of catalog + * statistics, because hive would change stats in metastore and thus change catalog statistics. 
+ */ + private def getStatsProperties(tableName: String): Map[String, String] = { + val hTable = hiveClient.getTable(spark.sessionState.catalog.getCurrentDatabase, tableName) + hTable.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) + } + + test("change stats after insert command for hive table") { + val table = s"change_stats_insert_hive_table" + withTable(table) { + sql(s"CREATE TABLE $table (i int, j string)") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + // insert into command + sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") + assert(getStatsProperties(table).isEmpty) + } + } + + test("change stats after load data command") { + val table = "change_stats_load_table" + withTable(table) { + sql(s"CREATE TABLE $table (i INT, j STRING) STORED AS PARQUET") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + withTempDir { loadPath => + // load data command + val file = new File(loadPath + "/data") + val writer = new PrintWriter(file) + writer.write("2,xyz") + writer.close() + sql(s"LOAD DATA INPATH '${loadPath.toURI.toString}' INTO TABLE $table") + assert(getStatsProperties(table).isEmpty) + } + } + } + + test("change stats after add/drop partition command") { + val table = "change_stats_part_table" + withTable(table) { + sql(s"CREATE TABLE $table (i INT, j STRING) PARTITIONED BY (ds STRING, hr STRING)") + // table has two partitions initially + for (ds <- Seq("2008-04-08"); hr <- Seq("11", "12")) { + sql(s"INSERT OVERWRITE TABLE $table PARTITION (ds='$ds',hr='$hr') SELECT 1, 'a'") + } + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)) + assert(fetched1.get.sizeInBytes > 0) + assert(fetched1.get.colStats.size == 2) + + withTempPaths(numPaths = 2) { case Seq(dir1, dir2) => + val file1 = new File(dir1 + "/data") + val writer1 = new PrintWriter(file1) + writer1.write("1,a") + writer1.close() + + val file2 = new File(dir2 + "/data") + val writer2 = new PrintWriter(file2) + writer2.write("1,a") + writer2.close() + + // add partition command + sql( + s""" + |ALTER TABLE $table ADD + |PARTITION (ds='2008-04-09', hr='11') LOCATION '${dir1.toURI.toString}' + |PARTITION (ds='2008-04-09', hr='12') LOCATION '${dir2.toURI.toString}' + """.stripMargin) + assert(getStatsProperties(table).isEmpty) + + // generate stats again + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(4)) + assert(fetched2.get.sizeInBytes > 0) + assert(fetched2.get.colStats.size == 2) + + // drop partition command + sql(s"ALTER TABLE $table DROP PARTITION (ds='2008-04-08'), PARTITION (hr='12')") + // only one partition left + assert(spark.sessionState.catalog.listPartitions(TableIdentifier(table)) + .map(_.spec).toSet == Set(Map("ds" -> "2008-04-09", "hr" -> "11"))) + assert(getStatsProperties(table).isEmpty) + } + } + } + test("add/drop partitions - managed table") { val catalog = spark.sessionState.catalog val managedTable = 
"partitionedTable" @@ -483,23 +565,19 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto assert(catalog.listPartitions(TableIdentifier(managedTable)).map(_.spec).toSet == Set(Map("ds" -> "2008-04-09", "hr" -> "11"))) - val stats2 = checkTableStats( - managedTable, hasSizeInBytes = true, expectedRowCounts = Some(4)) - assert(stats1 == stats2) - sql(s"ANALYZE TABLE $managedTable COMPUTE STATISTICS") - val stats3 = checkTableStats( + val stats2 = checkTableStats( managedTable, hasSizeInBytes = true, expectedRowCounts = Some(1)) - assert(stats2.get.sizeInBytes > stats3.get.sizeInBytes) + assert(stats1.get.sizeInBytes > stats2.get.sizeInBytes) sql(s"ALTER TABLE $managedTable ADD PARTITION (ds='2008-04-08', hr='12')") sql(s"ANALYZE TABLE $managedTable COMPUTE STATISTICS") val stats4 = checkTableStats( managedTable, hasSizeInBytes = true, expectedRowCounts = Some(1)) - assert(stats2.get.sizeInBytes > stats4.get.sizeInBytes) - assert(stats4.get.sizeInBytes == stats3.get.sizeInBytes) + assert(stats1.get.sizeInBytes > stats4.get.sizeInBytes) + assert(stats4.get.sizeInBytes == stats2.get.sizeInBytes) } } From a946be35ac177737e99942ad42de6f319f186138 Mon Sep 17 00:00:00 2001 From: Sital Kedia Date: Thu, 29 Jun 2017 14:25:51 +0800 Subject: [PATCH 038/779] [SPARK-3577] Report Spill size on disk for UnsafeExternalSorter ## What changes were proposed in this pull request? Report Spill size on disk for UnsafeExternalSorter ## How was this patch tested? Tested by running a job on cluster and verify the spill size on disk. Author: Sital Kedia Closes #17471 from sitalkedia/fix_disk_spill_size. --- .../unsafe/sort/UnsafeExternalSorter.java | 9 +++---- .../sort/UnsafeExternalSorterSuite.java | 25 +++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index f312fa2b2ddd7..82d03e3e9190c 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -54,7 +54,6 @@ public final class UnsafeExternalSorter extends MemoryConsumer { private final BlockManager blockManager; private final SerializerManager serializerManager; private final TaskContext taskContext; - private ShuffleWriteMetrics writeMetrics; /** The buffer size to use when writing spills using DiskBlockObjectWriter */ private final int fileBufferSizeBytes; @@ -144,10 +143,6 @@ private UnsafeExternalSorter( // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024 this.fileBufferSizeBytes = 32 * 1024; - // The spill metrics are stored in a new ShuffleWriteMetrics, - // and then discarded (this fixes SPARK-16827). - // TODO: Instead, separate spill metrics should be stored and reported (tracked in SPARK-3577). - this.writeMetrics = new ShuffleWriteMetrics(); if (existingInMemorySorter == null) { this.inMemSorter = new UnsafeInMemorySorter( @@ -199,6 +194,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { spillWriters.size(), spillWriters.size() > 1 ? " times" : " time"); + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); // We only write out contents of the inMemSorter if it is not empty. 
if (inMemSorter.numRecords() > 0) { final UnsafeSorterSpillWriter spillWriter = @@ -226,6 +222,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { // pages, we might not be able to get memory for the pointer array. taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); + taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); totalSpillBytes += spillSize; return spillSize; } @@ -502,6 +499,7 @@ public long spill() throws IOException { UnsafeInMemorySorter.SortedIterator inMemIterator = ((UnsafeInMemorySorter.SortedIterator) upstream).clone(); + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); // Iterate over the records that have not been returned and spill them. final UnsafeSorterSpillWriter spillWriter = new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics, numRecords); @@ -540,6 +538,7 @@ public long spill() throws IOException { inMemSorter.free(); inMemSorter = null; taskContext.taskMetrics().incMemoryBytesSpilled(released); + taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); totalSpillBytes += released; return released; } diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 771d39016c188..d31d7c1c0900c 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -405,6 +405,31 @@ public void forcedSpillingWithoutComparator() throws Exception { assertSpillFilesWereCleanedUp(); } + @Test + public void testDiskSpilledBytes() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + long[] record = new long[100]; + int recordSize = record.length * 8; + int n = (int) pageSizeBytes / recordSize * 3; + for (int i = 0; i < n; i++) { + record[0] = (long) i; + sorter.insertRecord(record, Platform.LONG_ARRAY_OFFSET, recordSize, 0, false); + } + // We will have at-least 2 memory pages allocated because of rounding happening due to + // integer division of pageSizeBytes and recordSize. + assertTrue(sorter.getNumberOfAllocatedPages() >= 2); + assertTrue(taskContext.taskMetrics().diskBytesSpilled() == 0); + UnsafeExternalSorter.SpillableIterator iter = + (UnsafeExternalSorter.SpillableIterator) sorter.getSortedIterator(); + assertTrue(iter.spill() > 0); + assertTrue(taskContext.taskMetrics().diskBytesSpilled() > 0); + assertEquals(0, iter.spill()); + // Even if we did not spill second time, the disk spilled bytes should still be non-zero + assertTrue(taskContext.taskMetrics().diskBytesSpilled() > 0); + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + @Test public void testPeakMemoryUsed() throws Exception { final long recordLengthBytes = 8; From 9f6b3e65ccfa0daec31b58c5a6386b3a890c2149 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 29 Jun 2017 14:37:42 +0800 Subject: [PATCH 039/779] [SPARK-21238][SQL] allow nested SQL execution ## What changes were proposed in this pull request? This is kind of another follow-up for https://github.com/apache/spark/pull/18064. In #18064, we wrap every SQL command with SQL execution, which makes nested SQL execution very likely to happen. #18419 tried to improve it a little bit by introducing `SQLExecution.ignoreNestedExecutionId`.
However, this is not friendly to data source developers, they may need to update their code to use this `ignoreNestedExecutionId` API. This PR proposes a new solution, to just allow nested execution. The downside is that, we may have multiple executions for one query. We can improve this by updating the data organization in SQLListener, to have 1-n mapping from query to execution, instead of 1-1 mapping. This can be done in a follow-up. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #18450 from cloud-fan/execution-id. --- .../spark/sql/execution/SQLExecution.scala | 88 ++++--------------- .../command/AnalyzeTableCommand.scala | 4 +- .../spark/sql/execution/command/cache.scala | 16 ++-- .../datasources/csv/CSVDataSource.scala | 4 +- .../datasources/jdbc/JDBCRelation.scala | 8 +- .../sql/execution/streaming/console.scala | 12 +-- .../sql/execution/streaming/memory.scala | 32 ++++--- .../sql/execution/SQLExecutionSuite.scala | 24 ----- 8 files changed, 50 insertions(+), 138 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index ca8bed5214f87..e991da7df0bde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -22,15 +22,12 @@ import java.util.concurrent.atomic.AtomicLong import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, - SparkListenerSQLExecutionStart} +import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} object SQLExecution { val EXECUTION_ID_KEY = "spark.sql.execution.id" - private val IGNORE_NESTED_EXECUTION_ID = "spark.sql.execution.ignoreNestedExecutionId" - private val _nextExecutionId = new AtomicLong(0) private def nextExecutionId: Long = _nextExecutionId.getAndIncrement @@ -45,10 +42,8 @@ object SQLExecution { private[sql] def checkSQLExecutionId(sparkSession: SparkSession): Unit = { val sc = sparkSession.sparkContext - val isNestedExecution = sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) != null - val hasExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) != null // only throw an exception during tests. a missing execution ID should not fail a job. - if (testing && !isNestedExecution && !hasExecutionId) { + if (testing && sc.getLocalProperty(EXECUTION_ID_KEY) == null) { // Attention testers: when a test fails with this exception, it means that the action that // started execution of a query didn't call withNewExecutionId. 
The execution ID should be // set by calling withNewExecutionId in the action that begins execution, like @@ -66,56 +61,27 @@ object SQLExecution { queryExecution: QueryExecution)(body: => T): T = { val sc = sparkSession.sparkContext val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) - if (oldExecutionId == null) { - val executionId = SQLExecution.nextExecutionId - sc.setLocalProperty(EXECUTION_ID_KEY, executionId.toString) - executionIdToQueryExecution.put(executionId, queryExecution) - try { - // sparkContext.getCallSite() would first try to pick up any call site that was previously - // set, then fall back to Utils.getCallSite(); call Utils.getCallSite() directly on - // streaming queries would give us call site like "run at :0" - val callSite = sparkSession.sparkContext.getCallSite() - - sparkSession.sparkContext.listenerBus.post(SparkListenerSQLExecutionStart( - executionId, callSite.shortForm, callSite.longForm, queryExecution.toString, - SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan), System.currentTimeMillis())) - try { - body - } finally { - sparkSession.sparkContext.listenerBus.post(SparkListenerSQLExecutionEnd( - executionId, System.currentTimeMillis())) - } - } finally { - executionIdToQueryExecution.remove(executionId) - sc.setLocalProperty(EXECUTION_ID_KEY, null) - } - } else if (sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) != null) { - // If `IGNORE_NESTED_EXECUTION_ID` is set, just ignore the execution id while evaluating the - // `body`, so that Spark jobs issued in the `body` won't be tracked. + val executionId = SQLExecution.nextExecutionId + sc.setLocalProperty(EXECUTION_ID_KEY, executionId.toString) + executionIdToQueryExecution.put(executionId, queryExecution) + try { + // sparkContext.getCallSite() would first try to pick up any call site that was previously + // set, then fall back to Utils.getCallSite(); call Utils.getCallSite() directly on + // streaming queries would give us call site like "run at :0" + val callSite = sparkSession.sparkContext.getCallSite() + + sparkSession.sparkContext.listenerBus.post(SparkListenerSQLExecutionStart( + executionId, callSite.shortForm, callSite.longForm, queryExecution.toString, + SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan), System.currentTimeMillis())) try { - sc.setLocalProperty(EXECUTION_ID_KEY, null) body } finally { - sc.setLocalProperty(EXECUTION_ID_KEY, oldExecutionId) + sparkSession.sparkContext.listenerBus.post(SparkListenerSQLExecutionEnd( + executionId, System.currentTimeMillis())) } - } else { - // Don't support nested `withNewExecutionId`. This is an example of the nested - // `withNewExecutionId`: - // - // class DataFrame { - // def foo: T = withNewExecutionId { something.createNewDataFrame().collect() } - // } - // - // Note: `collect` will call withNewExecutionId - // In this case, only the "executedPlan" for "collect" will be executed. The "executedPlan" - // for the outer DataFrame won't be executed. So it's meaningless to create a new Execution - // for the outer DataFrame. Even if we track it, since its "executedPlan" doesn't run, - // all accumulator metrics will be 0. It will confuse people if we show them in Web UI. - // - // A real case is the `DataFrame.count` method. 
- throw new IllegalArgumentException(s"$EXECUTION_ID_KEY is already set, please wrap your " + - "action with SQLExecution.ignoreNestedExecutionId if you don't want to track the Spark " + - "jobs issued by the nested execution.") + } finally { + executionIdToQueryExecution.remove(executionId) + sc.setLocalProperty(EXECUTION_ID_KEY, oldExecutionId) } } @@ -133,20 +99,4 @@ object SQLExecution { sc.setLocalProperty(SQLExecution.EXECUTION_ID_KEY, oldExecutionId) } } - - /** - * Wrap an action which may have nested execution id. This method can be used to run an execution - * inside another execution, e.g., `CacheTableCommand` need to call `Dataset.collect`. Note that, - * all Spark jobs issued in the body won't be tracked in UI. - */ - def ignoreNestedExecutionId[T](sparkSession: SparkSession)(body: => T): T = { - val sc = sparkSession.sparkContext - val allowNestedPreviousValue = sc.getLocalProperty(IGNORE_NESTED_EXECUTION_ID) - try { - sc.setLocalProperty(IGNORE_NESTED_EXECUTION_ID, "true") - body - } finally { - sc.setLocalProperty(IGNORE_NESTED_EXECUTION_ID, allowNestedPreviousValue) - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index d780ef42f3fae..42e2a9ca5c4e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -51,9 +51,7 @@ case class AnalyzeTableCommand( // 2. when total size is changed, `oldRowCount` becomes invalid. // This is to make sure that we only record the right statistics. if (!noscan) { - val newRowCount = SQLExecution.ignoreNestedExecutionId(sparkSession) { - sparkSession.table(tableIdentWithDB).count() - } + val newRowCount = sparkSession.table(tableIdentWithDB).count() if (newRowCount >= 0 && newRowCount != oldRowCount) { newStats = if (newStats.isDefined) { newStats.map(_.copy(rowCount = Some(BigInt(newRowCount)))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index d36eb7587a3ef..47952f2f227a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -34,16 +34,14 @@ case class CacheTableCommand( override def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { - SQLExecution.ignoreNestedExecutionId(sparkSession) { - plan.foreach { logicalPlan => - Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) - } - sparkSession.catalog.cacheTable(tableIdent.quotedString) + plan.foreach { logicalPlan => + Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) + } + sparkSession.catalog.cacheTable(tableIdent.quotedString) - if (!isLazy) { - // Performs eager caching - sparkSession.table(tableIdent).count() - } + if (!isLazy) { + // Performs eager caching + sparkSession.table(tableIdent).count() } Seq.empty[Row] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 99133bd70989a..2031381dd2e10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -145,9 +145,7 @@ object TextInputCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseDataset(sparkSession, inputPaths, parsedOptions) - val maybeFirstLine = SQLExecution.ignoreNestedExecutionId(sparkSession) { - CSVUtils.filterCommentAndEmpty(csv, parsedOptions).take(1).headOption - } + val maybeFirstLine = CSVUtils.filterCommentAndEmpty(csv, parsedOptions).take(1).headOption inferFromDataset(sparkSession, csv, maybeFirstLine, parsedOptions) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index b11da7045de22..a521fd1323852 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -130,11 +130,9 @@ private[sql] case class JDBCRelation( } override def insert(data: DataFrame, overwrite: Boolean): Unit = { - SQLExecution.ignoreNestedExecutionId(data.sparkSession) { - data.write - .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) - .jdbc(jdbcOptions.url, jdbcOptions.table, jdbcOptions.asProperties) - } + data.write + .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) + .jdbc(jdbcOptions.url, jdbcOptions.table, jdbcOptions.asProperties) } override def toString: String = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 6fa7c113defaa..3baea6376069f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -48,11 +48,9 @@ class ConsoleSink(options: Map[String, String]) extends Sink with Logging { println(batchIdStr) println("-------------------------------------------") // scalastyle:off println - SQLExecution.ignoreNestedExecutionId(data.sparkSession) { - data.sparkSession.createDataFrame( - data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) - .show(numRowsToShow, isTruncated) - } + data.sparkSession.createDataFrame( + data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) + .show(numRowsToShow, isTruncated) } } @@ -82,9 +80,7 @@ class ConsoleSinkProvider extends StreamSinkProvider // Truncate the displayed data if it is too long, by default it is true val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true) - SQLExecution.ignoreNestedExecutionId(sqlContext.sparkSession) { - data.show(numRowsToShow, isTruncated) - } + data.show(numRowsToShow, isTruncated) ConsoleRelation(sqlContext, data) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 198a342582804..4979873ee3c7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -194,23 +194,21 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi } if (notCommitted) { logDebug(s"Committing batch $batchId to $this") - SQLExecution.ignoreNestedExecutionId(data.sparkSession) { - 
outputMode match { - case Append | Update => - val rows = AddedData(batchId, data.collect()) - synchronized { batches += rows } - - case Complete => - val rows = AddedData(batchId, data.collect()) - synchronized { - batches.clear() - batches += rows - } - - case _ => - throw new IllegalArgumentException( - s"Output mode $outputMode is not supported by MemorySink") - } + outputMode match { + case Append | Update => + val rows = AddedData(batchId, data.collect()) + synchronized { batches += rows } + + case Complete => + val rows = AddedData(batchId, data.collect()) + synchronized { + batches.clear() + batches += rows + } + + case _ => + throw new IllegalArgumentException( + s"Output mode $outputMode is not supported by MemorySink") } } else { logDebug(s"Skipping already committed batch: $batchId") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala index fe78a76568837..f6b006b98edd1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -26,22 +26,9 @@ import org.apache.spark.sql.SparkSession class SQLExecutionSuite extends SparkFunSuite { test("concurrent query execution (SPARK-10548)") { - // Try to reproduce the issue with the old SparkContext val conf = new SparkConf() .setMaster("local[*]") .setAppName("test") - val badSparkContext = new BadSparkContext(conf) - try { - testConcurrentQueryExecution(badSparkContext) - fail("unable to reproduce SPARK-10548") - } catch { - case e: IllegalArgumentException => - assert(e.getMessage.contains(SQLExecution.EXECUTION_ID_KEY)) - } finally { - badSparkContext.stop() - } - - // Verify that the issue is fixed with the latest SparkContext val goodSparkContext = new SparkContext(conf) try { testConcurrentQueryExecution(goodSparkContext) @@ -134,17 +121,6 @@ class SQLExecutionSuite extends SparkFunSuite { } } -/** - * A bad [[SparkContext]] that does not clone the inheritable thread local properties - * when passing them to children threads. - */ -private class BadSparkContext(conf: SparkConf) extends SparkContext(conf) { - protected[spark] override val localProperties = new InheritableThreadLocal[Properties] { - override protected def childValue(parent: Properties): Properties = new Properties(parent) - override protected def initialValue(): Properties = new Properties() - } -} - object SQLExecutionSuite { @volatile var canProgress = false } From a2d5623548194f15989e7b68118d744673e33819 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 29 Jun 2017 01:23:13 -0700 Subject: [PATCH 040/779] [SPARK-20889][SPARKR] Grouped documentation for NONAGGREGATE column methods ## What changes were proposed in this pull request? Grouped documentation for nonaggregate column methods. Author: actuaryzhang Author: Wayne Zhang Closes #18422 from actuaryzhang/sparkRDocNonAgg. --- R/pkg/R/functions.R | 360 ++++++++++++++++++-------------------------- R/pkg/R/generics.R | 55 ++++--- 2 files changed, 182 insertions(+), 233 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 70ea620b471fe..cb09e847d739a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -132,23 +132,39 @@ NULL #' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))} NULL -#' lit +#' Non-aggregate functions for Column operations #' -#' A new \linkS4class{Column} is created to represent the literal value. 
-#' If the parameter is a \linkS4class{Column}, it is returned unchanged. +#' Non-aggregate functions defined for \code{Column}. #' -#' @param x a literal value or a Column. +#' @param x Column to compute on. In \code{lit}, it is a literal value or a Column. +#' In \code{expr}, it contains an expression character object to be parsed. +#' @param y Column to compute on. +#' @param ... additional Columns. +#' @name column_nonaggregate_functions +#' @rdname column_nonaggregate_functions +#' @seealso coalesce,SparkDataFrame-method #' @family non-aggregate functions -#' @rdname lit -#' @name lit +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} +NULL + +#' @details +#' \code{lit}: A new Column is created to represent the literal value. +#' If the parameter is a Column, it is returned unchanged. +#' +#' @rdname column_nonaggregate_functions #' @export -#' @aliases lit,ANY-method +#' @aliases lit lit,ANY-method #' @examples +#' #' \dontrun{ -#' lit(df$name) -#' select(df, lit("x")) -#' select(df, lit("2015-01-01")) -#'} +#' tmp <- mutate(df, v1 = lit(df$mpg), v2 = lit("x"), v3 = lit("2015-01-01"), +#' v4 = negate(df$mpg), v5 = expr('length(model)'), +#' v6 = greatest(df$vs, df$am), v7 = least(df$vs, df$am), +#' v8 = column("mpg")) +#' head(tmp)} #' @note lit since 1.5.0 setMethod("lit", signature("ANY"), function(x) { @@ -314,18 +330,16 @@ setMethod("bin", column(jc) }) -#' bitwiseNOT -#' -#' Computes bitwise NOT. -#' -#' @param x Column to compute on. +#' @details +#' \code{bitwiseNOT}: Computes bitwise NOT. #' -#' @rdname bitwiseNOT -#' @name bitwiseNOT -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export -#' @aliases bitwiseNOT,Column-method -#' @examples \dontrun{bitwiseNOT(df$c)} +#' @aliases bitwiseNOT bitwiseNOT,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, bitwiseNOT(cast(df$vs, "int"))))} #' @note bitwiseNOT since 1.5.0 setMethod("bitwiseNOT", signature(x = "Column"), @@ -375,16 +389,12 @@ setMethod("ceiling", ceil(x) }) -#' Returns the first column that is not NA -#' -#' Returns the first column that is not NA, or NA if all inputs are. +#' @details +#' \code{coalesce}: Returns the first column that is not NA, or NA if all inputs are. #' -#' @rdname coalesce -#' @name coalesce -#' @family non-aggregate functions +#' @rdname column_nonaggregate_functions #' @export #' @aliases coalesce,Column-method -#' @examples \dontrun{coalesce(df$c, df$d, df$e)} #' @note coalesce(Column) since 2.1.1 setMethod("coalesce", signature(x = "Column"), @@ -824,22 +834,24 @@ setMethod("initcap", column(jc) }) -#' is.nan -#' -#' Return true if the column is NaN, alias for \link{isnan} -#' -#' @param x Column to compute on. +#' @details +#' \code{isnan}: Returns true if the column is NaN. +#' @rdname column_nonaggregate_functions +#' @aliases isnan isnan,Column-method +#' @note isnan since 2.0.0 +setMethod("isnan", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "isnan", x@jc) + column(jc) + }) + +#' @details +#' \code{is.nan}: Alias for \link{isnan}. 
#' -#' @rdname is.nan -#' @name is.nan -#' @family non-aggregate functions -#' @aliases is.nan,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases is.nan is.nan,Column-method #' @export -#' @examples -#' \dontrun{ -#' is.nan(df$c) -#' isnan(df$c) -#' } #' @note is.nan since 2.0.0 setMethod("is.nan", signature(x = "Column"), @@ -847,17 +859,6 @@ setMethod("is.nan", isnan(x) }) -#' @rdname is.nan -#' @name isnan -#' @aliases isnan,Column-method -#' @note isnan since 2.0.0 -setMethod("isnan", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "isnan", x@jc) - column(jc) - }) - #' @details #' \code{kurtosis}: Returns the kurtosis of the values in a group. #' @@ -1129,27 +1130,24 @@ setMethod("minute", column(jc) }) -#' monotonically_increasing_id -#' -#' Return a column that generates monotonically increasing 64-bit integers. -#' -#' The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. -#' The current implementation puts the partition ID in the upper 31 bits, and the record number -#' within each partition in the lower 33 bits. The assumption is that the SparkDataFrame has -#' less than 1 billion partitions, and each partition has less than 8 billion records. -#' -#' As an example, consider a SparkDataFrame with two partitions, each with 3 records. +#' @details +#' \code{monotonically_increasing_id}: Returns a column that generates monotonically increasing +#' 64-bit integers. The generated ID is guaranteed to be monotonically increasing and unique, +#' but not consecutive. The current implementation puts the partition ID in the upper 31 bits, +#' and the record number within each partition in the lower 33 bits. The assumption is that the +#' SparkDataFrame has less than 1 billion partitions, and each partition has less than 8 billion +#' records. As an example, consider a SparkDataFrame with two partitions, each with 3 records. #' This expression would return the following IDs: #' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. -#' #' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL. +#' The method should be used with no argument. #' -#' @rdname monotonically_increasing_id -#' @aliases monotonically_increasing_id,missing-method -#' @name monotonically_increasing_id -#' @family misc functions +#' @rdname column_nonaggregate_functions +#' @aliases monotonically_increasing_id monotonically_increasing_id,missing-method #' @export -#' @examples \dontrun{select(df, monotonically_increasing_id())} +#' @examples +#' +#' \dontrun{head(select(df, monotonically_increasing_id()))} setMethod("monotonically_increasing_id", signature("missing"), function() { @@ -1171,18 +1169,12 @@ setMethod("month", column(jc) }) -#' negate -#' -#' Unary minus, i.e. negate the expression. -#' -#' @param x Column to compute on. +#' @details +#' \code{negate}: Unary minus, i.e. negate the expression. #' -#' @rdname negate -#' @name negate -#' @family non-aggregate functions -#' @aliases negate,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases negate negate,Column-method #' @export -#' @examples \dontrun{negate(df$c)} #' @note negate since 1.5.0 setMethod("negate", signature(x = "Column"), @@ -1481,23 +1473,19 @@ setMethod("stddev_samp", column(jc) }) -#' struct -#' -#' Creates a new struct column that composes multiple input columns. -#' -#' @param x a column to compute on. -#' @param ... optional column(s) to be included. 
+#' @details +#' \code{struct}: Creates a new struct column that composes multiple input columns. #' -#' @rdname struct -#' @name struct -#' @family non-aggregate functions -#' @aliases struct,characterOrColumn-method +#' @rdname column_nonaggregate_functions +#' @aliases struct struct,characterOrColumn-method #' @export #' @examples +#' #' \dontrun{ -#' struct(df$c, df$d) -#' struct("col1", "col2") -#' } +#' tmp <- mutate(df, v1 = struct(df$mpg, df$cyl), v2 = struct("hp", "wt", "vs"), +#' v3 = create_array(df$mpg, df$cyl, df$hp), +#' v4 = create_map(lit("x"), lit(1.0), lit("y"), lit(-1.0))) +#' head(tmp)} #' @note struct since 1.6.0 setMethod("struct", signature(x = "characterOrColumn"), @@ -1959,20 +1947,13 @@ setMethod("months_between", signature(y = "Column"), column(jc) }) -#' nanvl -#' -#' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' Both inputs should be floating point columns (DoubleType or FloatType). -#' -#' @param x first Column. -#' @param y second Column. +#' @details +#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column (\code{x}) if +#' the first column is NaN. Both inputs should be floating point columns (DoubleType or FloatType). #' -#' @rdname nanvl -#' @name nanvl -#' @family non-aggregate functions -#' @aliases nanvl,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases nanvl nanvl,Column-method #' @export -#' @examples \dontrun{nanvl(df$c, x)} #' @note nanvl since 1.5.0 setMethod("nanvl", signature(y = "Column"), function(y, x) { @@ -2060,20 +2041,13 @@ setMethod("concat", column(jc) }) -#' greatest -#' -#' Returns the greatest value of the list of column names, skipping null values. +#' @details +#' \code{greatest}: Returns the greatest value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' -#' @param x Column to compute on -#' @param ... other columns -#' -#' @family non-aggregate functions -#' @rdname greatest -#' @name greatest -#' @aliases greatest,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases greatest greatest,Column-method #' @export -#' @examples \dontrun{greatest(df$c, df$d)} #' @note greatest since 1.5.0 setMethod("greatest", signature(x = "Column"), @@ -2087,20 +2061,13 @@ setMethod("greatest", column(jc) }) -#' least -#' -#' Returns the least value of the list of column names, skipping null values. +#' @details +#' \code{least}: Returns the least value of the list of column names, skipping null values. #' This function takes at least 2 parameters. It will return null if all parameters are null. #' -#' @param x Column to compute on -#' @param ... other columns -#' -#' @family non-aggregate functions -#' @rdname least -#' @aliases least,Column-method -#' @name least +#' @rdname column_nonaggregate_functions +#' @aliases least least,Column-method #' @export -#' @examples \dontrun{least(df$c, df$d)} #' @note least since 1.5.0 setMethod("least", signature(x = "Column"), @@ -2445,18 +2412,13 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri column(jc) }) -#' expr -#' -#' Parses the expression string into the column that it represents, similar to -#' SparkDataFrame.selectExpr +#' @details +#' \code{expr}: Parses the expression string into the column that it represents, similar to +#' \code{SparkDataFrame.selectExpr} #' -#' @param x an expression character object to be parsed. 
-#' @family non-aggregate functions -#' @rdname expr -#' @aliases expr,character-method -#' @name expr +#' @rdname column_nonaggregate_functions +#' @aliases expr expr,character-method #' @export -#' @examples \dontrun{expr('length(name)')} #' @note expr since 1.5.0 setMethod("expr", signature(x = "character"), function(x) { @@ -2617,18 +2579,19 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), column(jc) }) -#' rand -#' -#' Generate a random column with independent and identically distributed (i.i.d.) samples +#' @details +#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) samples #' from U[0.0, 1.0]. #' +#' @rdname column_nonaggregate_functions #' @param seed a random seed. Can be missing. -#' @family non-aggregate functions -#' @rdname rand -#' @name rand -#' @aliases rand,missing-method +#' @aliases rand rand,missing-method #' @export -#' @examples \dontrun{rand()} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, r1 = rand(), r2 = rand(10), r3 = randn(), r4 = randn(10)) +#' head(tmp)} #' @note rand since 1.5.0 setMethod("rand", signature(seed = "missing"), function(seed) { @@ -2636,8 +2599,7 @@ setMethod("rand", signature(seed = "missing"), column(jc) }) -#' @rdname rand -#' @name rand +#' @rdname column_nonaggregate_functions #' @aliases rand,numeric-method #' @export #' @note rand(numeric) since 1.5.0 @@ -2647,18 +2609,13 @@ setMethod("rand", signature(seed = "numeric"), column(jc) }) -#' randn -#' -#' Generate a column with independent and identically distributed (i.i.d.) samples from +#' @details +#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples from #' the standard normal distribution. #' -#' @param seed a random seed. Can be missing. -#' @family non-aggregate functions -#' @rdname randn -#' @name randn -#' @aliases randn,missing-method +#' @rdname column_nonaggregate_functions +#' @aliases randn randn,missing-method #' @export -#' @examples \dontrun{randn()} #' @note randn since 1.5.0 setMethod("randn", signature(seed = "missing"), function(seed) { @@ -2666,8 +2623,7 @@ setMethod("randn", signature(seed = "missing"), column(jc) }) -#' @rdname randn -#' @name randn +#' @rdname column_nonaggregate_functions #' @aliases randn,numeric-method #' @export #' @note randn(numeric) since 1.5.0 @@ -2819,20 +2775,26 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) column(jc) }) -#' when -#' -#' Evaluates a list of conditions and returns one of multiple possible result expressions. + +#' @details +#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result expressions. #' For unmatched expressions null is returned. #' +#' @rdname column_nonaggregate_functions #' @param condition the condition to test on. Must be a Column expression. #' @param value result expression. 
-#' @family non-aggregate functions -#' @rdname when -#' @name when -#' @aliases when,Column-method -#' @seealso \link{ifelse} +#' @aliases when when,Column-method #' @export -#' @examples \dontrun{when(df$age == 2, df$age + 1)} +#' @examples +#' +#' \dontrun{ +#' tmp <- mutate(df, mpg_na = otherwise(when(df$mpg > 20, df$mpg), lit(NaN)), +#' mpg2 = ifelse(df$mpg > 20 & df$am > 0, 0, 1), +#' mpg3 = ifelse(df$mpg > 20, df$mpg, 20.0)) +#' head(tmp) +#' tmp <- mutate(tmp, ind_na1 = is.nan(tmp$mpg_na), ind_na2 = isnan(tmp$mpg_na)) +#' head(select(tmp, coalesce(tmp$mpg_na, tmp$mpg))) +#' head(select(tmp, nanvl(tmp$mpg_na, tmp$hp)))} #' @note when since 1.5.0 setMethod("when", signature(condition = "Column", value = "ANY"), function(condition, value) { @@ -2842,25 +2804,16 @@ setMethod("when", signature(condition = "Column", value = "ANY"), column(jc) }) -#' ifelse -#' -#' Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. +#' @details +#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. #' Otherwise \code{no} is returned for unmatched conditions. #' +#' @rdname column_nonaggregate_functions #' @param test a Column expression that describes the condition. #' @param yes return values for \code{TRUE} elements of test. #' @param no return values for \code{FALSE} elements of test. -#' @family non-aggregate functions -#' @rdname ifelse -#' @name ifelse -#' @aliases ifelse,Column-method -#' @seealso \link{when} +#' @aliases ifelse ifelse,Column-method #' @export -#' @examples -#' \dontrun{ -#' ifelse(df$a > 1 & df$b > 2, 0, 1) -#' ifelse(df$a > 1, df$a, 1) -#' } #' @note ifelse since 1.5.0 setMethod("ifelse", signature(test = "Column", yes = "ANY", no = "ANY"), @@ -3263,19 +3216,12 @@ setMethod("posexplode", column(jc) }) -#' create_array -#' -#' Creates a new array column. The input columns must all have the same data type. -#' -#' @param x Column to compute on -#' @param ... additional Column(s). +#' @details +#' \code{create_array}: Creates a new array column. The input columns must all have the same data type. #' -#' @family non-aggregate functions -#' @rdname create_array -#' @name create_array -#' @aliases create_array,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases create_array create_array,Column-method #' @export -#' @examples \dontrun{create_array(df$x, df$y, df$z)} #' @note create_array since 2.3.0 setMethod("create_array", signature(x = "Column"), @@ -3288,22 +3234,15 @@ setMethod("create_array", column(jc) }) -#' create_map -#' -#' Creates a new map column. The input columns must be grouped as key-value pairs, +#' @details +#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value pairs, #' e.g. (key1, value1, key2, value2, ...). #' The key columns must all have the same data type, and can't be null. #' The value columns must all have the same data type. #' -#' @param x Column to compute on -#' @param ... additional Column(s). 
-#' -#' @family non-aggregate functions -#' @rdname create_map -#' @name create_map -#' @aliases create_map,Column-method +#' @rdname column_nonaggregate_functions +#' @aliases create_map create_map,Column-method #' @export -#' @examples \dontrun{create_map(lit("x"), lit(1.0), lit("y"), lit(-1.0))} #' @note create_map since 2.3.0 setMethod("create_map", signature(x = "Column"), @@ -3554,21 +3493,18 @@ setMethod("grouping_id", column(jc) }) -#' input_file_name -#' -#' Creates a string column with the input file name for a given row +#' @details +#' \code{input_file_name}: Creates a string column with the input file name for a given row. +#' The method should be used with no argument. #' -#' @rdname input_file_name -#' @name input_file_name -#' @family non-aggregate functions -#' @aliases input_file_name,missing-method +#' @rdname column_nonaggregate_functions +#' @aliases input_file_name input_file_name,missing-method #' @export #' @examples -#' \dontrun{ -#' df <- read.text("README.md") #' -#' head(select(df, input_file_name())) -#' } +#' \dontrun{ +#' tmp <- read.text("README.md") +#' head(select(tmp, input_file_name()))} #' @note input_file_name since 2.3.0 setMethod("input_file_name", signature("missing"), function() { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index dc99e3d94b269..1deb057bb1b82 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -422,9 +422,8 @@ setGeneric("cache", function(x) { standardGeneric("cache") }) setGeneric("checkpoint", function(x, eager = TRUE) { standardGeneric("checkpoint") }) #' @rdname coalesce -#' @param x a Column or a SparkDataFrame. -#' @param ... additional argument(s). If \code{x} is a Column, additional Columns can be optionally -#' provided. +#' @param x a SparkDataFrame. +#' @param ... additional argument(s). #' @export setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") }) @@ -863,8 +862,9 @@ setGeneric("rlike", function(x, ...) { standardGeneric("rlike") }) #' @export setGeneric("startsWith", function(x, prefix) { standardGeneric("startsWith") }) -#' @rdname when +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("when", function(condition, value) { standardGeneric("when") }) #' @rdname otherwise @@ -938,8 +938,9 @@ setGeneric("base64", function(x) { standardGeneric("base64") }) #' @name NULL setGeneric("bin", function(x) { standardGeneric("bin") }) -#' @rdname bitwiseNOT +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) #' @rdname column_math_functions @@ -995,12 +996,14 @@ setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") #' @export setGeneric("crc32", function(x) { standardGeneric("crc32") }) -#' @rdname create_array +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("create_array", function(x, ...) { standardGeneric("create_array") }) -#' @rdname create_map +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("create_map", function(x, ...) 
{ standardGeneric("create_map") }) #' @rdname hash @@ -1065,8 +1068,9 @@ setGeneric("explode", function(x) { standardGeneric("explode") }) #' @export setGeneric("explode_outer", function(x) { standardGeneric("explode_outer") }) -#' @rdname expr +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("expr", function(x) { standardGeneric("expr") }) #' @rdname column_datetime_diff_functions @@ -1093,8 +1097,9 @@ setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") #' @name NULL setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") }) -#' @rdname greatest +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) #' @rdname column_aggregate_functions @@ -1127,9 +1132,9 @@ setGeneric("hypot", function(y, x) { standardGeneric("hypot") }) #' @name NULL setGeneric("initcap", function(x) { standardGeneric("initcap") }) -#' @param x empty. Should be used with no argument. -#' @rdname input_file_name +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("input_file_name", function(x = "missing") { standardGeneric("input_file_name") }) @@ -1138,8 +1143,9 @@ setGeneric("input_file_name", #' @name NULL setGeneric("instr", function(y, x) { standardGeneric("instr") }) -#' @rdname is.nan +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("isnan", function(x) { standardGeneric("isnan") }) #' @rdname column_aggregate_functions @@ -1164,8 +1170,9 @@ setGeneric("last_day", function(x) { standardGeneric("last_day") }) #' @export setGeneric("lead", function(x, offset, defaultValue = NULL) { standardGeneric("lead") }) -#' @rdname least +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("least", function(x, ...) { standardGeneric("least") }) #' @rdname column_string_functions @@ -1173,8 +1180,9 @@ setGeneric("least", function(x, ...) { standardGeneric("least") }) #' @name NULL setGeneric("levenshtein", function(y, x) { standardGeneric("levenshtein") }) -#' @rdname lit +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("lit", function(x) { standardGeneric("lit") }) #' @rdname column_string_functions @@ -1206,9 +1214,9 @@ setGeneric("md5", function(x) { standardGeneric("md5") }) #' @name NULL setGeneric("minute", function(x) { standardGeneric("minute") }) -#' @param x empty. Should be used with no argument. 
-#' @rdname monotonically_increasing_id +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("monotonically_increasing_id", function(x = "missing") { standardGeneric("monotonically_increasing_id") }) @@ -1226,12 +1234,14 @@ setGeneric("months_between", function(y, x) { standardGeneric("months_between") #' @export setGeneric("n", function(x) { standardGeneric("n") }) -#' @rdname nanvl +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("nanvl", function(y, x) { standardGeneric("nanvl") }) -#' @rdname negate +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("negate", function(x) { standardGeneric("negate") }) #' @rdname not @@ -1275,12 +1285,14 @@ setGeneric("posexplode_outer", function(x) { standardGeneric("posexplode_outer") #' @name NULL setGeneric("quarter", function(x) { standardGeneric("quarter") }) -#' @rdname rand +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("rand", function(seed) { standardGeneric("rand") }) -#' @rdname randn +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("randn", function(seed) { standardGeneric("randn") }) #' @rdname rank @@ -1409,8 +1421,9 @@ setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") }) #' @name NULL setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) -#' @rdname struct +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("struct", function(x, ...) { standardGeneric("struct") }) #' @rdname column_string_functions From 70085e83d1ee728b23f7df15f570eb8d77f67a7a Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 29 Jun 2017 09:51:12 +0100 Subject: [PATCH 041/779] [SPARK-21210][DOC][ML] Javadoc 8 fixes for ML shared param traits PR #15999 included fixes for doc strings in the ML shared param traits (occurrences of `>` and `>=`). This PR simply uses the HTML-escaped version of the param doc to embed into the Scaladoc, to ensure that when `SharedParamsCodeGen` is run, the generated javadoc will be compliant for Java 8. ## How was this patch tested? Existing tests Author: Nick Pentreath Closes #18420 from MLnick/shared-params-javadoc8. --- .../apache/spark/ml/param/shared/SharedParamsCodeGen.scala | 5 ++++- .../org/apache/spark/ml/param/shared/sharedParams.scala | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index c94b8b4e9dfda..013817a41baf5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.param.shared import java.io.PrintWriter import scala.reflect.ClassTag +import scala.xml.Utility /** * Code generator for shared params (sharedParams.scala). Run under the Spark folder with @@ -167,6 +168,8 @@ private[shared] object SharedParamsCodeGen { "def" } + val htmlCompliantDoc = Utility.escape(doc) + s""" |/** | * Trait for shared param $name$defaultValueDoc. @@ -174,7 +177,7 @@ private[shared] object SharedParamsCodeGen { |private[ml] trait Has$Name extends Params { | | /** - | * Param for $doc. + | * Param for $htmlCompliantDoc. 
| * @group ${groupStr(0)} | */ | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index e3e03dfd43dd6..50619607a5054 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -176,7 +176,7 @@ private[ml] trait HasThreshold extends Params { private[ml] trait HasThresholds extends Params { /** - * Param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold. + * Param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold. * @group param */ final val thresholds: DoubleArrayParam = new DoubleArrayParam(this, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold", (t: Array[Double]) => t.forall(_ >= 0) && t.count(_ == 0) <= 1) From d106a74c53f493c3c18741a9b19cb821dace4ba2 Mon Sep 17 00:00:00 2001 From: jinxing Date: Thu, 29 Jun 2017 09:59:36 +0100 Subject: [PATCH 042/779] [SPARK-21240] Fix code style for constructing and stopping a SparkContext in UT. ## What changes were proposed in this pull request? Same as SPARK-20985. Fix code style for constructing and stopping a `SparkContext`. Ensure the context is stopped so that other tests do not complain that only one `SparkContext` can exist. ## How was this patch tested? Author: jinxing Closes #18454 from jinxing64/SPARK-21240.
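For readers unfamiliar with the pattern adopted in the diff below, `withSpark` is a loan-pattern helper: it hands the freshly constructed `SparkContext` to the test body and guarantees that `stop()` runs afterwards, even if the body throws. The following is a minimal, illustrative sketch of such a helper, not Spark's actual `org.apache.spark.LocalSparkContext.withSpark` (which may also clean up additional global state); the object name and usage here are assumptions for illustration only.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch of a loan-pattern helper for tests: the context is always
// stopped, even when the body throws, so later tests can create their own.
object WithSparkSketch {
  def withSpark[T](sc: SparkContext)(body: SparkContext => T): T = {
    try {
      body(sc)
    } finally {
      sc.stop()
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("withSpark-sketch")
    // Mirrors the usage introduced in the test changes in this patch.
    val count = withSpark(new SparkContext(conf)) { sc =>
      sc.parallelize(1 to 100, 4).count()
    }
    assert(count == 100)
  }
}
```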
--- .../scala/org/apache/spark/scheduler/MapStatusSuite.scala | 6 ++---- .../apache/spark/sql/execution/ui/SQLListenerSuite.scala | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index e6120139f4958..276169e02f01d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -26,6 +26,7 @@ import org.roaringbitmap.RoaringBitmap import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkFunSuite} import org.apache.spark.internal.config +import org.apache.spark.LocalSparkContext._ import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.storage.BlockManagerId @@ -160,12 +161,9 @@ class MapStatusSuite extends SparkFunSuite { .set("spark.serializer", classOf[KryoSerializer].getName) .setMaster("local") .setAppName("SPARK-21133") - val sc = new SparkContext(conf) - try { + withSpark(new SparkContext(conf)) { sc => val count = sc.parallelize(0 until 3000, 10).repartition(2001).collect().length assert(count === 3000) - } finally { - sc.stop() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index e6cd41e4facf1..82eff5e6491ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -25,6 +25,7 @@ import org.mockito.Mockito.mock import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.config +import org.apache.spark.LocalSparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.scheduler._ import org.apache.spark.sql.{DataFrame, SparkSession} @@ -496,8 +497,7 @@ class SQLListenerMemoryLeakSuite extends SparkFunSuite { .setAppName("test") .set(config.MAX_TASK_FAILURES, 1) // Don't retry the tasks to run this test quickly .set("spark.sql.ui.retainedExecutions", "50") // Set it to 50 to run this test quickly - val sc = new SparkContext(conf) - try { + withSpark(new SparkContext(conf)) { sc => SparkSession.sqlListener.set(null) val spark = new SparkSession(sc) import spark.implicits._ @@ -522,8 +522,6 @@ class SQLListenerMemoryLeakSuite extends SparkFunSuite { assert(spark.sharedState.listener.executionIdToData.size <= 100) assert(spark.sharedState.listener.jobIdToExecutionId.size <= 100) assert(spark.sharedState.listener.stageIdToStageMetrics.size <= 100) - } finally { - sc.stop() } } } From d7da2b94d6107341b33ca9224e9bfa4c9a92ed88 Mon Sep 17 00:00:00 2001 From: fjh100456 Date: Thu, 29 Jun 2017 10:01:12 +0100 Subject: [PATCH 043/779] =?UTF-8?q?[SPARK-21135][WEB=20UI]=20On=20history?= =?UTF-8?q?=20server=20page=EF=BC=8Cduration=20of=20incompleted=20applicat?= =?UTF-8?q?ions=20should=20be=20hidden=20instead=20of=20showing=20up=20as?= =?UTF-8?q?=200?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Hide duration of incompleted applications. ## How was this patch tested? manual tests Author: fjh100456 Closes #18351 from fjh100456/master. 
--- .../spark/ui/static/historypage-template.html | 4 ++-- .../org/apache/spark/ui/static/historypage.js | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html index bfe31aae555ba..6cff0068d8bcb 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html @@ -44,7 +44,7 @@ Completed - + Duration @@ -74,7 +74,7 @@ {{attemptId}} {{startTime}} {{endTime}} - {{duration}} + {{duration}} {{sparkUser}} {{lastUpdated}} Download diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 5ec1ce15a2127..9edd3ba0e0ba6 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -182,12 +182,17 @@ $(document).ready(function() { for (i = 0; i < completedCells.length; i++) { completedCells[i].style.display='none'; } - } - var durationCells = document.getElementsByClassName("durationClass"); - for (i = 0; i < durationCells.length; i++) { - var timeInMilliseconds = parseInt(durationCells[i].title); - durationCells[i].innerHTML = formatDuration(timeInMilliseconds); + var durationCells = document.getElementsByClassName("durationColumn"); + for (i = 0; i < durationCells.length; i++) { + durationCells[i].style.display='none'; + } + } else { + var durationCells = document.getElementsByClassName("durationClass"); + for (i = 0; i < durationCells.length; i++) { + var timeInMilliseconds = parseInt(durationCells[i].title); + durationCells[i].innerHTML = formatDuration(timeInMilliseconds); + } } if ($(selector.concat(" tr")).length < 20) { From 29bd251dd5914fc3b6146eb4fe0b45f1c84dba62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B2=BB=E5=9B=BD10192065?= Date: Thu, 29 Jun 2017 20:53:48 +0800 Subject: [PATCH 044/779] [SPARK-21225][CORE] Considering CPUS_PER_TASK when allocating task slots for each WorkerOffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JIRA Issue: https://issues.apache.org/jira/browse/SPARK-21225 In the function "resourceOffers", it declares a variable "tasks" to store the tasks that have been allocated an executor. It is declared like this: `val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))` But I think this code only considers the situation of one task per core. If the user sets "spark.task.cpus" to 2 or 3, it really does not need so much memory. I think it can be modified to `val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))` instead. Another benefit of modifying it like this is that it is easier to understand how the tasks are allocated to offers. Author: 杨治国10192065 Closes #18435 from JackYangzg/motifyTaskCoreDisp.
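To make the sizing argument concrete, the sketch below shows how many task slots a worker offer can hold once each task needs `spark.task.cpus` cores. It is an illustrative, self-contained example rather than the actual `TaskSchedulerImpl` code: the `WorkerOffer` case class here is a simplified stand-in for Spark's, and a plain type parameter replaces `TaskDescription`.

```scala
import scala.collection.mutable.ArrayBuffer

// Simplified stand-in for Spark's WorkerOffer: an executor's id, host and free cores.
case class WorkerOffer(executorId: String, host: String, cores: Int)

object TaskSlotSizingSketch {
  // Each task consumes cpusPerTask cores, so an offer with `cores` free cores can
  // hold at most cores / cpusPerTask tasks; size the per-offer buffer accordingly.
  def taskBuffers[T](offers: Seq[WorkerOffer], cpusPerTask: Int): Seq[ArrayBuffer[T]] =
    offers.map(o => new ArrayBuffer[T](o.cores / cpusPerTask))

  def main(args: Array[String]): Unit = {
    val offers = Seq(WorkerOffer("exec-1", "hostA", cores = 8), WorkerOffer("exec-2", "hostB", cores = 6))
    val cpusPerTask = 2
    // Buffers are pre-sized to 4 and 3 slots rather than 8 and 6.
    taskBuffers[String](offers, cpusPerTask)
    val slots = offers.map(_.cores / cpusPerTask)
    println(s"task slots per offer: ${slots.mkString(", ")}") // prints: task slots per offer: 4, 3
  }
}
```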
--- .../scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 91ec172ffeda1..737b383631148 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -345,7 +345,7 @@ private[spark] class TaskSchedulerImpl( val shuffledOffers = shuffleOffers(filteredOffers) // Build a list of tasks to assign to each worker. - val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores)) + val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK)) val availableCpus = shuffledOffers.map(o => o.cores).toArray val sortedTaskSets = rootPool.getSortedTaskSetQueue for (taskSet <- sortedTaskSets) { From 18066f2e61f430b691ed8a777c9b4e5786bf9dbc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 29 Jun 2017 21:28:48 +0800 Subject: [PATCH 045/779] [SPARK-21052][SQL] Add hash map metrics to join ## What changes were proposed in this pull request? This adds the average hash map probe metrics to join operator such as `BroadcastHashJoin` and `ShuffledHashJoin`. This PR adds the API to `HashedRelation` to get average hash map probe. ## How was this patch tested? Related test cases are added. Author: Liang-Chi Hsieh Closes #18301 from viirya/SPARK-21052. --- .../aggregate/HashAggregateExec.scala | 15 +- .../TungstenAggregationIterator.scala | 34 ++-- .../joins/BroadcastHashJoinExec.scala | 30 ++- .../spark/sql/execution/joins/HashJoin.scala | 8 +- .../sql/execution/joins/HashedRelation.scala | 43 +++- .../joins/ShuffledHashJoinExec.scala | 6 +- .../sql/execution/metric/SQLMetrics.scala | 32 ++- .../execution/metric/SQLMetricsSuite.scala | 188 ++++++++++++++++-- 8 files changed, 296 insertions(+), 60 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 5027a615ced7a..56f61c30c4a38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -60,7 +60,7 @@ case class HashAggregateExec( "peakMemory" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory"), "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"), "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "aggregate time"), - "avgHashmapProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hashmap probe")) + "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) @@ -94,7 +94,7 @@ case class HashAggregateExec( val numOutputRows = longMetric("numOutputRows") val peakMemory = longMetric("peakMemory") val spillSize = longMetric("spillSize") - val avgHashmapProbe = longMetric("avgHashmapProbe") + val avgHashProbe = longMetric("avgHashProbe") child.execute().mapPartitions { iter => @@ -119,7 +119,7 @@ case class HashAggregateExec( numOutputRows, peakMemory, spillSize, - avgHashmapProbe) + avgHashProbe) if (!hasInput && groupingExpressions.isEmpty) { numOutputRows += 1 Iterator.single[UnsafeRow](aggregationIterator.outputForEmptyGroupingKeyWithoutInput()) @@ -344,7 +344,7 @@ 
case class HashAggregateExec( sorter: UnsafeKVExternalSorter, peakMemory: SQLMetric, spillSize: SQLMetric, - avgHashmapProbe: SQLMetric): KVIterator[UnsafeRow, UnsafeRow] = { + avgHashProbe: SQLMetric): KVIterator[UnsafeRow, UnsafeRow] = { // update peak execution memory val mapMemory = hashMap.getPeakMemoryUsedBytes @@ -355,8 +355,7 @@ case class HashAggregateExec( metrics.incPeakExecutionMemory(maxMemory) // Update average hashmap probe - val avgProbes = hashMap.getAverageProbesPerLookup() - avgHashmapProbe.add(avgProbes.ceil.toLong) + avgHashProbe.set(hashMap.getAverageProbesPerLookup()) if (sorter == null) { // not spilled @@ -584,7 +583,7 @@ case class HashAggregateExec( val doAgg = ctx.freshName("doAggregateWithKeys") val peakMemory = metricTerm(ctx, "peakMemory") val spillSize = metricTerm(ctx, "spillSize") - val avgHashmapProbe = metricTerm(ctx, "avgHashmapProbe") + val avgHashProbe = metricTerm(ctx, "avgHashProbe") def generateGenerateCode(): String = { if (isFastHashMapEnabled) { @@ -611,7 +610,7 @@ case class HashAggregateExec( s"$iterTermForFastHashMap = $fastHashMapTerm.rowIterator();"} else ""} $iterTerm = $thisPlan.finishAggregate($hashMapTerm, $sorterTerm, $peakMemory, $spillSize, - $avgHashmapProbe); + $avgHashProbe); } """) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index 8efa95d48aea0..cfa930607360c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -89,7 +89,7 @@ class TungstenAggregationIterator( numOutputRows: SQLMetric, peakMemory: SQLMetric, spillSize: SQLMetric, - avgHashmapProbe: SQLMetric) + avgHashProbe: SQLMetric) extends AggregationIterator( groupingExpressions, originalInputAttributes, @@ -367,6 +367,22 @@ class TungstenAggregationIterator( } } + TaskContext.get().addTaskCompletionListener(_ => { + // At the end of the task, update the task's peak memory usage. Since we destroy + // the map to create the sorter, their memory usages should not overlap, so it is safe + // to just use the max of the two. + val mapMemory = hashMap.getPeakMemoryUsedBytes + val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L) + val maxMemory = Math.max(mapMemory, sorterMemory) + val metrics = TaskContext.get().taskMetrics() + peakMemory.set(maxMemory) + spillSize.set(metrics.memoryBytesSpilled - spillSizeBefore) + metrics.incPeakExecutionMemory(maxMemory) + + // Updating average hashmap probe + avgHashProbe.set(hashMap.getAverageProbesPerLookup()) + }) + /////////////////////////////////////////////////////////////////////////// // Part 7: Iterator's public methods. /////////////////////////////////////////////////////////////////////////// @@ -409,22 +425,6 @@ class TungstenAggregationIterator( } } - // If this is the last record, update the task's peak memory usage. Since we destroy - // the map to create the sorter, their memory usages should not overlap, so it is safe - // to just use the max of the two. 
- if (!hasNext) { - val mapMemory = hashMap.getPeakMemoryUsedBytes - val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L) - val maxMemory = Math.max(mapMemory, sorterMemory) - val metrics = TaskContext.get().taskMetrics() - peakMemory += maxMemory - spillSize += metrics.memoryBytesSpilled - spillSizeBefore - metrics.incPeakExecutionMemory(maxMemory) - - // Update average hashmap probe if this is the last record. - val averageProbes = hashMap.getAverageProbesPerLookup() - avgHashmapProbe.add(averageProbes.ceil.toLong) - } numOutputRows += 1 res } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala index 0bc261d593df4..bfa1e9d49a545 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastDistribution, Dist import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.LongType +import org.apache.spark.util.TaskCompletionListener /** * Performs an inner hash join of two child relations. When the output RDD of this operator is @@ -46,7 +47,8 @@ case class BroadcastHashJoinExec( extends BinaryExecNode with HashJoin with CodegenSupport { override lazy val metrics = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = { val mode = HashedRelationBroadcastMode(buildKeys) @@ -60,12 +62,13 @@ case class BroadcastHashJoinExec( protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") + val avgHashProbe = longMetric("avgHashProbe") val broadcastRelation = buildPlan.executeBroadcast[HashedRelation]() streamedPlan.execute().mapPartitions { streamedIter => val hashed = broadcastRelation.value.asReadOnlyCopy() TaskContext.get().taskMetrics().incPeakExecutionMemory(hashed.estimatedSize) - join(streamedIter, hashed, numOutputRows) + join(streamedIter, hashed, numOutputRows, avgHashProbe) } } @@ -90,6 +93,23 @@ case class BroadcastHashJoinExec( } } + /** + * Returns the codes used to add a task completion listener to update avg hash probe + * at the end of the task. + */ + private def genTaskListener(avgHashProbe: String, relationTerm: String): String = { + val listenerClass = classOf[TaskCompletionListener].getName + val taskContextClass = classOf[TaskContext].getName + s""" + | $taskContextClass$$.MODULE$$.get().addTaskCompletionListener(new $listenerClass() { + | @Override + | public void onTaskCompletion($taskContextClass context) { + | $avgHashProbe.set($relationTerm.getAverageProbesPerLookup()); + | } + | }); + """.stripMargin + } + /** * Returns a tuple of Broadcast of HashedRelation and the variable name for it. */ @@ -99,10 +119,16 @@ case class BroadcastHashJoinExec( val broadcast = ctx.addReferenceObj("broadcast", broadcastRelation) val relationTerm = ctx.freshName("relation") val clsName = broadcastRelation.value.getClass.getName + + // At the end of the task, we update the avg hash probe. 
+ val avgHashProbe = metricTerm(ctx, "avgHashProbe") + val addTaskListener = genTaskListener(avgHashProbe, relationTerm) + ctx.addMutableState(clsName, relationTerm, s""" | $relationTerm = (($clsName) $broadcast.value()).asReadOnlyCopy(); | incPeakExecutionMemory($relationTerm.estimatedSize()); + | $addTaskListener """.stripMargin) (broadcastRelation, relationTerm) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index 1aef5f6864263..b09edf380c2d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.joins +import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -193,7 +194,8 @@ trait HashJoin { protected def join( streamedIter: Iterator[InternalRow], hashed: HashedRelation, - numOutputRows: SQLMetric): Iterator[InternalRow] = { + numOutputRows: SQLMetric, + avgHashProbe: SQLMetric): Iterator[InternalRow] = { val joinedIter = joinType match { case _: InnerLike => @@ -211,6 +213,10 @@ trait HashJoin { s"BroadcastHashJoin should not take $x as the JoinType") } + // At the end of the task, we update the avg hash probe. + TaskContext.get().addTaskCompletionListener(_ => + avgHashProbe.set(hashed.getAverageProbesPerLookup())) + val resultProj = createResultProjection joinedIter.map { r => numOutputRows += 1 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 2dd1dc3da96c9..3c702856114f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -79,6 +79,11 @@ private[execution] sealed trait HashedRelation extends KnownSizeEstimation { * Release any used resources. */ def close(): Unit + + /** + * Returns the average number of probes per key lookup. + */ + def getAverageProbesPerLookup(): Double } private[execution] object HashedRelation { @@ -242,7 +247,8 @@ private[joins] class UnsafeHashedRelation( binaryMap = new BytesToBytesMap( taskMemoryManager, (nKeys * 1.5 + 1).toInt, // reduce hash collision - pageSizeBytes) + pageSizeBytes, + true) var i = 0 var keyBuffer = new Array[Byte](1024) @@ -273,6 +279,8 @@ private[joins] class UnsafeHashedRelation( override def read(kryo: Kryo, in: Input): Unit = Utils.tryOrIOException { read(in.readInt, in.readLong, in.readBytes) } + + override def getAverageProbesPerLookup(): Double = binaryMap.getAverageProbesPerLookup() } private[joins] object UnsafeHashedRelation { @@ -290,7 +298,8 @@ private[joins] object UnsafeHashedRelation { taskMemoryManager, // Only 70% of the slots can be used before growing, more capacity help to reduce collision (sizeEstimate * 1.5 + 1).toInt, - pageSizeBytes) + pageSizeBytes, + true) // Create a mapping of buildKeys -> rows val keyGenerator = UnsafeProjection.create(key) @@ -344,7 +353,7 @@ private[joins] object UnsafeHashedRelation { * determined by `key1 - minKey`. * * The map is created as sparse mode, then key-value could be appended into it. 
Once finish - * appending, caller could all optimize() to try to turn the map into dense mode, which is faster + * appending, caller could call optimize() to try to turn the map into dense mode, which is faster * to probe. * * see http://java-performance.info/implementing-world-fastest-java-int-to-int-hash-map/ @@ -385,6 +394,10 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap // The number of unique keys. private var numKeys = 0L + // Tracking average number of probes per key lookup. + private var numKeyLookups = 0L + private var numProbes = 0L + // needed by serializer def this() = { this( @@ -469,6 +482,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap */ def getValue(key: Long, resultRow: UnsafeRow): UnsafeRow = { if (isDense) { + numKeyLookups += 1 + numProbes += 1 if (key >= minKey && key <= maxKey) { val value = array((key - minKey).toInt) if (value > 0) { @@ -477,11 +492,14 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap } } else { var pos = firstSlot(key) + numKeyLookups += 1 + numProbes += 1 while (array(pos + 1) != 0) { if (array(pos) == key) { return getRow(array(pos + 1), resultRow) } pos = nextSlot(pos) + numProbes += 1 } } null @@ -509,6 +527,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap */ def get(key: Long, resultRow: UnsafeRow): Iterator[UnsafeRow] = { if (isDense) { + numKeyLookups += 1 + numProbes += 1 if (key >= minKey && key <= maxKey) { val value = array((key - minKey).toInt) if (value > 0) { @@ -517,11 +537,14 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap } } else { var pos = firstSlot(key) + numKeyLookups += 1 + numProbes += 1 while (array(pos + 1) != 0) { if (array(pos) == key) { return valueIter(array(pos + 1), resultRow) } pos = nextSlot(pos) + numProbes += 1 } } null @@ -573,8 +596,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap private def updateIndex(key: Long, address: Long): Unit = { var pos = firstSlot(key) assert(numKeys < array.length / 2) + numKeyLookups += 1 + numProbes += 1 while (array(pos) != key && array(pos + 1) != 0) { pos = nextSlot(pos) + numProbes += 1 } if (array(pos + 1) == 0) { // this is the first value for this key, put the address in array. @@ -686,6 +712,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap writeLong(maxKey) writeLong(numKeys) writeLong(numValues) + writeLong(numKeyLookups) + writeLong(numProbes) writeLong(array.length) writeLongArray(writeBuffer, array, array.length) @@ -727,6 +755,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap maxKey = readLong() numKeys = readLong() numValues = readLong() + numKeyLookups = readLong() + numProbes = readLong() val length = readLong().toInt mask = length - 2 @@ -742,6 +772,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap override def read(kryo: Kryo, in: Input): Unit = { read(in.readBoolean, in.readLong, in.readBytes) } + + /** + * Returns the average number of probes per key lookup. 
+ */ + def getAverageProbesPerLookup(): Double = numProbes.toDouble / numKeyLookups } private[joins] class LongHashedRelation( @@ -793,6 +828,8 @@ private[joins] class LongHashedRelation( resultRow = new UnsafeRow(nFields) map = in.readObject().asInstanceOf[LongToUnsafeRowMap] } + + override def getAverageProbesPerLookup(): Double = map.getAverageProbesPerLookup() } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala index afb6e5e3dd235..f1df41ca49c27 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala @@ -42,7 +42,8 @@ case class ShuffledHashJoinExec( override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), - "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) + "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"), + "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil @@ -62,9 +63,10 @@ case class ShuffledHashJoinExec( protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") + val avgHashProbe = longMetric("avgHashProbe") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) - join(streamIter, hashed, numOutputRows) + join(streamIter, hashed, numOutputRows, avgHashProbe) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 49cab04de2bf0..b4653c1b564f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -57,6 +57,12 @@ class SQLMetric(val metricType: String, initValue: Long = 0L) extends Accumulato override def add(v: Long): Unit = _value += v + // We can set a double value to `SQLMetric` which stores only long value, if it is + // average metrics. + def set(v: Double): Unit = SQLMetrics.setDoubleForAverageMetrics(this, v) + + def set(v: Long): Unit = _value = v + def +=(v: Long): Unit = _value += v override def value: Long = _value @@ -74,6 +80,19 @@ object SQLMetrics { private val TIMING_METRIC = "timing" private val AVERAGE_METRIC = "average" + private val baseForAvgMetric: Int = 10 + + /** + * Converts a double value to long value by multiplying a base integer, so we can store it in + * `SQLMetrics`. It only works for average metrics. When showing the metrics on UI, we restore + * it back to a double value up to the decimal places bound by the base integer. 
+ */ + private[sql] def setDoubleForAverageMetrics(metric: SQLMetric, v: Double): Unit = { + assert(metric.metricType == AVERAGE_METRIC, + s"Can't set a double to a metric of metrics type: ${metric.metricType}") + metric.set((v * baseForAvgMetric).toLong) + } + def createMetric(sc: SparkContext, name: String): SQLMetric = { val acc = new SQLMetric(SUM_METRIC) acc.register(sc, name = Some(name), countFailedValues = false) @@ -104,15 +123,14 @@ object SQLMetrics { /** * Create a metric to report the average information (including min, med, max) like - * avg hashmap probe. Because `SQLMetric` stores long values, we take the ceil of the average - * values before storing them. This metric is used to record an average value computed in the - * end of a task. It should be set once. The initial values (zeros) of this metrics will be - * excluded after. + * avg hash probe. As average metrics are double values, this kind of metrics should be + * only set with `SQLMetric.set` method instead of other methods like `SQLMetric.add`. + * The initial values (zeros) of this metrics will be excluded after. */ def createAverageMetric(sc: SparkContext, name: String): SQLMetric = { // The final result of this metric in physical operator UI may looks like: // probe avg (min, med, max): - // (1, 2, 6) + // (1.2, 2.2, 6.3) val acc = new SQLMetric(AVERAGE_METRIC) acc.register(sc, name = Some(s"$name (min, med, max)"), countFailedValues = false) acc @@ -127,7 +145,7 @@ object SQLMetrics { val numberFormat = NumberFormat.getIntegerInstance(Locale.US) numberFormat.format(values.sum) } else if (metricsType == AVERAGE_METRIC) { - val numberFormat = NumberFormat.getIntegerInstance(Locale.US) + val numberFormat = NumberFormat.getNumberInstance(Locale.US) val validValues = values.filter(_ > 0) val Seq(min, med, max) = { @@ -137,7 +155,7 @@ object SQLMetrics { val sorted = validValues.sorted Seq(sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) } - metric.map(numberFormat.format) + metric.map(v => numberFormat.format(v.toDouble / baseForAvgMetric)) } s"\n($min, $med, $max)" } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index a12ce2b9eba34..cb3405b2fe19b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -47,9 +47,10 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { private def getSparkPlanMetrics( df: DataFrame, expectedNumOfJobs: Int, - expectedNodeIds: Set[Long]): Option[Map[Long, (String, Map[String, Any])]] = { + expectedNodeIds: Set[Long], + enableWholeStage: Boolean = false): Option[Map[Long, (String, Map[String, Any])]] = { val previousExecutionIds = spark.sharedState.listener.executionIdToData.keySet - withSQLConf("spark.sql.codegen.wholeStage" -> "false") { + withSQLConf("spark.sql.codegen.wholeStage" -> enableWholeStage.toString) { df.collect() } sparkContext.listenerBus.waitUntilEmpty(10000) @@ -110,6 +111,20 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { } } + /** + * Generates a `DataFrame` by filling randomly generated bytes for hash collision. 
+ */ + private def generateRandomBytesDF(numRows: Int = 65535): DataFrame = { + val random = new Random() + val manyBytes = (0 until numRows).map { _ => + val byteArrSize = random.nextInt(100) + val bytes = new Array[Byte](byteArrSize) + random.nextBytes(bytes) + (bytes, random.nextInt(100)) + } + manyBytes.toSeq.toDF("a", "b") + } + test("LocalTableScanExec computes metrics in collect and take") { val df1 = spark.createDataset(Seq(1, 2, 3)) val logical = df1.queryExecution.logical @@ -151,9 +166,9 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { val df = testData2.groupBy().count() // 2 partitions val expected1 = Seq( Map("number of output rows" -> 2L, - "avg hashmap probe (min, med, max)" -> "\n(1, 1, 1)"), + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)"), Map("number of output rows" -> 1L, - "avg hashmap probe (min, med, max)" -> "\n(1, 1, 1)")) + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)")) testSparkPlanMetrics(df, 1, Map( 2L -> ("HashAggregate", expected1(0)), 0L -> ("HashAggregate", expected1(1))) @@ -163,9 +178,9 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { val df2 = testData2.groupBy('a).count() val expected2 = Seq( Map("number of output rows" -> 4L, - "avg hashmap probe (min, med, max)" -> "\n(1, 1, 1)"), + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)"), Map("number of output rows" -> 3L, - "avg hashmap probe (min, med, max)" -> "\n(1, 1, 1)")) + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)")) testSparkPlanMetrics(df2, 1, Map( 2L -> ("HashAggregate", expected2(0)), 0L -> ("HashAggregate", expected2(1))) @@ -173,19 +188,42 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { } test("Aggregate metrics: track avg probe") { - val random = new Random() - val manyBytes = (0 until 65535).map { _ => - val byteArrSize = random.nextInt(100) - val bytes = new Array[Byte](byteArrSize) - random.nextBytes(bytes) - (bytes, random.nextInt(100)) - } - val df = manyBytes.toSeq.toDF("a", "b").repartition(1).groupBy('a).count() - val metrics = getSparkPlanMetrics(df, 1, Set(2L, 0L)).get - Seq(metrics(2L)._2("avg hashmap probe (min, med, max)"), - metrics(0L)._2("avg hashmap probe (min, med, max)")).foreach { probes => - probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe => - assert(probe.toInt > 1) + // The executed plan looks like: + // HashAggregate(keys=[a#61], functions=[count(1)], output=[a#61, count#71L]) + // +- Exchange hashpartitioning(a#61, 5) + // +- HashAggregate(keys=[a#61], functions=[partial_count(1)], output=[a#61, count#76L]) + // +- Exchange RoundRobinPartitioning(1) + // +- LocalTableScan [a#61] + // + // Assume the execution plan with node id is: + // Wholestage disabled: + // HashAggregate(nodeId = 0) + // Exchange(nodeId = 1) + // HashAggregate(nodeId = 2) + // Exchange (nodeId = 3) + // LocalTableScan(nodeId = 4) + // + // Wholestage enabled: + // WholeStageCodegen(nodeId = 0) + // HashAggregate(nodeId = 1) + // Exchange(nodeId = 2) + // WholeStageCodegen(nodeId = 3) + // HashAggregate(nodeId = 4) + // Exchange(nodeId = 5) + // LocalTableScan(nodeId = 6) + Seq(true, false).foreach { enableWholeStage => + val df = generateRandomBytesDF().repartition(1).groupBy('a).count() + val nodeIds = if (enableWholeStage) { + Set(4L, 1L) + } else { + Set(2L, 0L) + } + val metrics = getSparkPlanMetrics(df, 1, nodeIds, enableWholeStage).get + nodeIds.foreach { nodeId => + val probes = metrics(nodeId)._2("avg hash probe (min, med, max)") + 
probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe => + assert(probe.toDouble > 1.0) + } } } } @@ -267,10 +305,120 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { val df = df1.join(broadcast(df2), "key") testSparkPlanMetrics(df, 2, Map( 1L -> ("BroadcastHashJoin", Map( - "number of output rows" -> 2L))) + "number of output rows" -> 2L, + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)"))) ) } + test("BroadcastHashJoin metrics: track avg probe") { + // The executed plan looks like: + // Project [a#210, b#211, b#221] + // +- BroadcastHashJoin [a#210], [a#220], Inner, BuildRight + // :- Project [_1#207 AS a#210, _2#208 AS b#211] + // : +- Filter isnotnull(_1#207) + // : +- LocalTableScan [_1#207, _2#208] + // +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, binary, true])) + // +- Project [_1#217 AS a#220, _2#218 AS b#221] + // +- Filter isnotnull(_1#217) + // +- LocalTableScan [_1#217, _2#218] + // + // Assume the execution plan with node id is + // WholeStageCodegen disabled: + // Project(nodeId = 0) + // BroadcastHashJoin(nodeId = 1) + // ...(ignored) + // + // WholeStageCodegen enabled: + // WholeStageCodegen(nodeId = 0) + // Project(nodeId = 1) + // BroadcastHashJoin(nodeId = 2) + // Project(nodeId = 3) + // Filter(nodeId = 4) + // ...(ignored) + Seq(true, false).foreach { enableWholeStage => + val df1 = generateRandomBytesDF() + val df2 = generateRandomBytesDF() + val df = df1.join(broadcast(df2), "a") + val nodeIds = if (enableWholeStage) { + Set(2L) + } else { + Set(1L) + } + val metrics = getSparkPlanMetrics(df, 2, nodeIds, enableWholeStage).get + nodeIds.foreach { nodeId => + val probes = metrics(nodeId)._2("avg hash probe (min, med, max)") + probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe => + assert(probe.toDouble > 1.0) + } + } + } + } + + test("ShuffledHashJoin metrics") { + withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "40", + "spark.sql.shuffle.partitions" -> "2", + "spark.sql.join.preferSortMergeJoin" -> "false") { + val df1 = Seq((1, "1"), (2, "2")).toDF("key", "value") + val df2 = (1 to 10).map(i => (i, i.toString)).toSeq.toDF("key", "value") + // Assume the execution plan is + // ... 
-> ShuffledHashJoin(nodeId = 1) -> Project(nodeId = 0) + val df = df1.join(df2, "key") + val metrics = getSparkPlanMetrics(df, 1, Set(1L)) + testSparkPlanMetrics(df, 1, Map( + 1L -> ("ShuffledHashJoin", Map( + "number of output rows" -> 2L, + "avg hash probe (min, med, max)" -> "\n(1, 1, 1)"))) + ) + } + } + + test("ShuffledHashJoin metrics: track avg probe") { + // The executed plan looks like: + // Project [a#308, b#309, b#319] + // +- ShuffledHashJoin [a#308], [a#318], Inner, BuildRight + // :- Exchange hashpartitioning(a#308, 2) + // : +- Project [_1#305 AS a#308, _2#306 AS b#309] + // : +- Filter isnotnull(_1#305) + // : +- LocalTableScan [_1#305, _2#306] + // +- Exchange hashpartitioning(a#318, 2) + // +- Project [_1#315 AS a#318, _2#316 AS b#319] + // +- Filter isnotnull(_1#315) + // +- LocalTableScan [_1#315, _2#316] + // + // Assume the execution plan with node id is + // WholeStageCodegen disabled: + // Project(nodeId = 0) + // ShuffledHashJoin(nodeId = 1) + // ...(ignored) + // + // WholeStageCodegen enabled: + // WholeStageCodegen(nodeId = 0) + // Project(nodeId = 1) + // ShuffledHashJoin(nodeId = 2) + // ...(ignored) + withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "5000000", + "spark.sql.shuffle.partitions" -> "2", + "spark.sql.join.preferSortMergeJoin" -> "false") { + Seq(true, false).foreach { enableWholeStage => + val df1 = generateRandomBytesDF(65535 * 5) + val df2 = generateRandomBytesDF(65535) + val df = df1.join(df2, "a") + val nodeIds = if (enableWholeStage) { + Set(2L) + } else { + Set(1L) + } + val metrics = getSparkPlanMetrics(df, 1, nodeIds, enableWholeStage).get + nodeIds.foreach { nodeId => + val probes = metrics(nodeId)._2("avg hash probe (min, med, max)") + probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe => + assert(probe.toDouble > 1.0) + } + } + } + } + } + test("BroadcastHashJoin(outer) metrics") { val df1 = Seq((1, "a"), (1, "b"), (4, "c")).toDF("key", "value") val df2 = Seq((1, "a"), (1, "b"), (2, "c"), (3, "d")).toDF("key2", "value") From f9151bebca986d44cdab7699959fec2bc050773a Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Thu, 29 Jun 2017 16:03:15 -0700 Subject: [PATCH 046/779] [SPARK-21188][CORE] releaseAllLocksForTask should synchronize the whole method ## What changes were proposed in this pull request? Since the objects `readLocksByTask`, `writeLocksByTask` and `info`s are coupled and supposed to be modified by other threads concurrently, all the read and writes of them in the method `releaseAllLocksForTask` should be protected by a single synchronized block like other similar methods. ## How was this patch tested? existing tests Author: Feng Liu Closes #18400 from liufengdb/synchronize. 
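To make the race concrete, here is a minimal self-contained sketch of why several small synchronized blocks are weaker than one block spanning the whole method. This is not the actual `BlockInfoManager` code; the `locksByTask` and `readerCount` maps below are illustrative stand-ins for the coupled `readLocksByTask`/`writeLocksByTask`/`infos` structures.

import scala.collection.mutable

class LockRegistry {
  // Shared mutable bookkeeping that several threads touch concurrently.
  private val locksByTask = mutable.Map.empty[Long, mutable.Buffer[String]]
  private val readerCount = mutable.Map.empty[String, Int].withDefaultValue(0)

  def acquire(taskId: Long, blockId: String): Unit = synchronized {
    locksByTask.getOrElseUpdate(taskId, mutable.Buffer.empty) += blockId
    readerCount(blockId) += 1
  }

  // Racy variant: the removal and the per-block cleanup run under separate
  // monitors, so another thread can interleave between them and observe the
  // two structures in an inconsistent state.
  def releaseRacy(taskId: Long): Seq[String] = {
    val locks = synchronized {
      locksByTask.remove(taskId).getOrElse(mutable.Buffer.empty)
    }
    locks.foreach { blockId =>
      synchronized { readerCount(blockId) -= 1 }
    }
    synchronized { notifyAll() }
    locks.toSeq
  }

  // Safe variant, mirroring the patch: the whole remove-cleanup-notify sequence
  // holds the same monitor, so the coupled structures stay consistent.
  def releaseSafe(taskId: Long): Seq[String] = synchronized {
    val locks = locksByTask.remove(taskId).getOrElse(mutable.Buffer.empty)
    locks.foreach { blockId => readerCount(blockId) -= 1 }
    notifyAll()
    locks.toSeq
  }
}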
--- .../spark/storage/BlockInfoManager.scala | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala index 7064872ec1c77..219a0e799cc73 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala @@ -341,15 +341,11 @@ private[storage] class BlockInfoManager extends Logging { * * @return the ids of blocks whose pins were released */ - def releaseAllLocksForTask(taskAttemptId: TaskAttemptId): Seq[BlockId] = { + def releaseAllLocksForTask(taskAttemptId: TaskAttemptId): Seq[BlockId] = synchronized { val blocksWithReleasedLocks = mutable.ArrayBuffer[BlockId]() - val readLocks = synchronized { - readLocksByTask.remove(taskAttemptId).getOrElse(ImmutableMultiset.of[BlockId]()) - } - val writeLocks = synchronized { - writeLocksByTask.remove(taskAttemptId).getOrElse(Seq.empty) - } + val readLocks = readLocksByTask.remove(taskAttemptId).getOrElse(ImmutableMultiset.of[BlockId]()) + val writeLocks = writeLocksByTask.remove(taskAttemptId).getOrElse(Seq.empty) for (blockId <- writeLocks) { infos.get(blockId).foreach { info => @@ -358,21 +354,19 @@ private[storage] class BlockInfoManager extends Logging { } blocksWithReleasedLocks += blockId } + readLocks.entrySet().iterator().asScala.foreach { entry => val blockId = entry.getElement val lockCount = entry.getCount blocksWithReleasedLocks += blockId - synchronized { - get(blockId).foreach { info => - info.readerCount -= lockCount - assert(info.readerCount >= 0) - } + get(blockId).foreach { info => + info.readerCount -= lockCount + assert(info.readerCount >= 0) } } - synchronized { - notifyAll() - } + notifyAll() + blocksWithReleasedLocks } From 4996c53949376153f9ebdc74524fed7226968808 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 30 Jun 2017 10:56:48 +0800 Subject: [PATCH 047/779] [SPARK-21253][CORE] Fix a bug that StreamCallback may not be notified if network errors happen ## What changes were proposed in this pull request? If a network error happens before processing StreamResponse/StreamFailure events, StreamCallback.onFailure won't be called. This PR fixes `failOutstandingRequests` to also notify outstanding StreamCallbacks. ## How was this patch tested? The new unit tests. Author: Shixiong Zhu Closes #18472 from zsxwing/fix-stream-2. --- .../spark/network/client/TransportClient.java | 2 +- .../client/TransportResponseHandler.java | 38 ++++++++++++++----- .../TransportResponseHandlerSuite.java | 31 ++++++++++++++- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index a6f527c118218..8f354ad78bbaa 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -179,7 +179,7 @@ public void stream(String streamId, StreamCallback callback) { // written to the socket atomically, so that callbacks are called in the right order // when responses arrive. 
synchronized (this) { - handler.addStreamCallback(callback); + handler.addStreamCallback(streamId, callback); channel.writeAndFlush(new StreamRequest(streamId)).addListener(future -> { if (future.isSuccess()) { long timeTaken = System.currentTimeMillis() - startTime; diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java index 41bead546cad6..be9f18203c8e4 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java @@ -24,6 +24,8 @@ import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.atomic.AtomicLong; +import scala.Tuple2; + import com.google.common.annotations.VisibleForTesting; import io.netty.channel.Channel; import org.slf4j.Logger; @@ -56,7 +58,7 @@ public class TransportResponseHandler extends MessageHandler { private final Map outstandingRpcs; - private final Queue streamCallbacks; + private final Queue> streamCallbacks; private volatile boolean streamActive; /** Records the time (in system nanoseconds) that the last fetch or RPC request was sent. */ @@ -88,9 +90,9 @@ public void removeRpcRequest(long requestId) { outstandingRpcs.remove(requestId); } - public void addStreamCallback(StreamCallback callback) { + public void addStreamCallback(String streamId, StreamCallback callback) { timeOfLastRequestNs.set(System.nanoTime()); - streamCallbacks.offer(callback); + streamCallbacks.offer(Tuple2.apply(streamId, callback)); } @VisibleForTesting @@ -104,15 +106,31 @@ public void deactivateStream() { */ private void failOutstandingRequests(Throwable cause) { for (Map.Entry entry : outstandingFetches.entrySet()) { - entry.getValue().onFailure(entry.getKey().chunkIndex, cause); + try { + entry.getValue().onFailure(entry.getKey().chunkIndex, cause); + } catch (Exception e) { + logger.warn("ChunkReceivedCallback.onFailure throws exception", e); + } } for (Map.Entry entry : outstandingRpcs.entrySet()) { - entry.getValue().onFailure(cause); + try { + entry.getValue().onFailure(cause); + } catch (Exception e) { + logger.warn("RpcResponseCallback.onFailure throws exception", e); + } + } + for (Tuple2 entry : streamCallbacks) { + try { + entry._2().onFailure(entry._1(), cause); + } catch (Exception e) { + logger.warn("StreamCallback.onFailure throws exception", e); + } } // It's OK if new fetches appear, as they will fail immediately. 
outstandingFetches.clear(); outstandingRpcs.clear(); + streamCallbacks.clear(); } @Override @@ -190,8 +208,9 @@ public void handle(ResponseMessage message) throws Exception { } } else if (message instanceof StreamResponse) { StreamResponse resp = (StreamResponse) message; - StreamCallback callback = streamCallbacks.poll(); - if (callback != null) { + Tuple2 entry = streamCallbacks.poll(); + if (entry != null) { + StreamCallback callback = entry._2(); if (resp.byteCount > 0) { StreamInterceptor interceptor = new StreamInterceptor(this, resp.streamId, resp.byteCount, callback); @@ -216,8 +235,9 @@ public void handle(ResponseMessage message) throws Exception { } } else if (message instanceof StreamFailure) { StreamFailure resp = (StreamFailure) message; - StreamCallback callback = streamCallbacks.poll(); - if (callback != null) { + Tuple2 entry = streamCallbacks.poll(); + if (entry != null) { + StreamCallback callback = entry._2(); try { callback.onFailure(resp.streamId, new RuntimeException(resp.error)); } catch (IOException ioe) { diff --git a/common/network-common/src/test/java/org/apache/spark/network/TransportResponseHandlerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/TransportResponseHandlerSuite.java index 09fc80d12d510..b4032c4c3f031 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/TransportResponseHandlerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/TransportResponseHandlerSuite.java @@ -17,6 +17,7 @@ package org.apache.spark.network; +import java.io.IOException; import java.nio.ByteBuffer; import io.netty.channel.Channel; @@ -127,7 +128,7 @@ public void testActiveStreams() throws Exception { StreamResponse response = new StreamResponse("stream", 1234L, null); StreamCallback cb = mock(StreamCallback.class); - handler.addStreamCallback(cb); + handler.addStreamCallback("stream", cb); assertEquals(1, handler.numOutstandingRequests()); handler.handle(response); assertEquals(1, handler.numOutstandingRequests()); @@ -135,9 +136,35 @@ public void testActiveStreams() throws Exception { assertEquals(0, handler.numOutstandingRequests()); StreamFailure failure = new StreamFailure("stream", "uh-oh"); - handler.addStreamCallback(cb); + handler.addStreamCallback("stream", cb); assertEquals(1, handler.numOutstandingRequests()); handler.handle(failure); assertEquals(0, handler.numOutstandingRequests()); } + + @Test + public void failOutstandingStreamCallbackOnClose() throws Exception { + Channel c = new LocalChannel(); + c.pipeline().addLast(TransportFrameDecoder.HANDLER_NAME, new TransportFrameDecoder()); + TransportResponseHandler handler = new TransportResponseHandler(c); + + StreamCallback cb = mock(StreamCallback.class); + handler.addStreamCallback("stream-1", cb); + handler.channelInactive(); + + verify(cb).onFailure(eq("stream-1"), isA(IOException.class)); + } + + @Test + public void failOutstandingStreamCallbackOnException() throws Exception { + Channel c = new LocalChannel(); + c.pipeline().addLast(TransportFrameDecoder.HANDLER_NAME, new TransportFrameDecoder()); + TransportResponseHandler handler = new TransportResponseHandler(c); + + StreamCallback cb = mock(StreamCallback.class); + handler.addStreamCallback("stream-1", cb); + handler.exceptionCaught(new IOException("Oops!")); + + verify(cb).onFailure(eq("stream-1"), isA(IOException.class)); + } } From 80f7ac3a601709dd9471092244612023363f54cd Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 30 Jun 2017 11:02:22 +0800 Subject: [PATCH 048/779] 
[SPARK-21253][CORE] Disable spark.reducer.maxReqSizeShuffleToMem ## What changes were proposed in this pull request? Disable spark.reducer.maxReqSizeShuffleToMem because it breaks the old shuffle service. Credits to wangyum Closes #18466 ## How was this patch tested? Jenkins Author: Shixiong Zhu Author: Yuming Wang Closes #18467 from zsxwing/SPARK-21253. --- .../scala/org/apache/spark/internal/config/package.scala | 3 ++- docs/configuration.md | 8 -------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index be63c637a3a13..8dee0d970c4c6 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -323,10 +323,11 @@ package object config { private[spark] val REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM = ConfigBuilder("spark.reducer.maxReqSizeShuffleToMem") + .internal() .doc("The blocks of a shuffle request will be fetched to disk when size of the request is " + "above this threshold. This is to avoid a giant request takes too much memory.") .bytesConf(ByteUnit.BYTE) - .createWithDefaultString("200m") + .createWithDefault(Long.MaxValue) private[spark] val TASK_METRICS_TRACK_UPDATED_BLOCK_STATUSES = ConfigBuilder("spark.taskMetrics.trackUpdatedBlockStatuses") diff --git a/docs/configuration.md b/docs/configuration.md index c8e61537a457c..bd6a1f9e240e2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -528,14 +528,6 @@ Apart from these, the following properties are also available, and may be useful By allowing it to limit the number of fetch requests, this scenario can be mitigated. - - spark.reducer.maxReqSizeShuffleToMem - 200m - - The blocks of a shuffle request will be fetched to disk when size of the request is above - this threshold. This is to avoid a giant request takes too much memory. - - spark.shuffle.compress true From 88a536babf119b7e331d02aac5d52b57658803bf Mon Sep 17 00:00:00 2001 From: IngoSchuster Date: Fri, 30 Jun 2017 11:16:09 +0800 Subject: [PATCH 049/779] [SPARK-21176][WEB UI] Limit number of selector threads for admin ui proxy servlets to 8 ## What changes were proposed in this pull request? Please see also https://issues.apache.org/jira/browse/SPARK-21176 This change limits the number of selector threads that jetty creates to maximum 8 per proxy servlet (Jetty default is number of processors / 2). The newHttpClient for Jettys ProxyServlet class is overwritten to avoid the Jetty defaults (which are designed for high-performance http servers). Once https://github.com/eclipse/jetty.project/issues/1643 is available, the code could be cleaned up to avoid the method override. I really need this on v2.1.1 - what is the best way for a backport automatic merge works fine)? Shall I create another PR? ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) The patch was tested manually on a Spark cluster with a head node that has 88 processors using JMX to verify that the number of selector threads is now limited to 8 per proxy. gurvindersingh zsxwing can you please review the change? Author: IngoSchuster Author: Ingo Schuster Closes #18437 from IngoSchuster/master. 
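For a sense of scale, here is a rough REPL-style sketch of the thread arithmetic. Only the 88-core head node, the #CPUs/2 Jetty default, the one-proxy-per-executor behaviour, and the cap of 8 come from the patch description and its code comment; the executor count of 200 is an assumed number used purely for illustration.

// Jetty creates availableProcessors / 2 selector threads per HttpClient by default,
// and reverse-proxy mode instantiates one proxy (and thus one HttpClient) per executor.
def selectorThreads(cores: Int, executors: Int, cap: Option[Int] = None): Int = {
  val perClient = math.max(1, cores / 2)
  val limited = cap.map(c => math.min(c, perClient)).getOrElse(perClient)
  limited * executors
}

selectorThreads(cores = 88, executors = 200)                 // 8800 selector threads with the Jetty default
selectorThreads(cores = 88, executors = 200, cap = Some(8))  // 1600 with the patch's cap of 8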
--- .../main/scala/org/apache/spark/ui/JettyUtils.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index edf328b5ae538..b9371c7ad7b45 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -26,6 +26,8 @@ import scala.language.implicitConversions import scala.xml.Node import org.eclipse.jetty.client.api.Response +import org.eclipse.jetty.client.HttpClient +import org.eclipse.jetty.client.http.HttpClientTransportOverHTTP import org.eclipse.jetty.proxy.ProxyServlet import org.eclipse.jetty.server._ import org.eclipse.jetty.server.handler._ @@ -208,6 +210,16 @@ private[spark] object JettyUtils extends Logging { rewrittenURI.toString() } + override def newHttpClient(): HttpClient = { + // SPARK-21176: Use the Jetty logic to calculate the number of selector threads (#CPUs/2), + // but limit it to 8 max. + // Otherwise, it might happen that we exhaust the threadpool since in reverse proxy mode + // a proxy is instantiated for each executor. If the head node has many processors, this + // can quickly add up to an unreasonably high number of threads. + val numSelectors = math.max(1, math.min(8, Runtime.getRuntime().availableProcessors() / 2)) + new HttpClient(new HttpClientTransportOverHTTP(numSelectors), null) + } + override def filterServerResponseHeader( clientRequest: HttpServletRequest, serverResponse: Response, From cfc696f4a4289acf132cb26baf7c02c5b6305277 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 29 Jun 2017 20:56:37 -0700 Subject: [PATCH 050/779] [SPARK-21253][CORE][HOTFIX] Fix Scala 2.10 build ## What changes were proposed in this pull request? A follow up PR to fix Scala 2.10 build for #18472 ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #18478 from zsxwing/SPARK-21253-2. --- .../apache/spark/network/client/TransportResponseHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java index be9f18203c8e4..340b8b96aabc6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportResponseHandler.java @@ -92,7 +92,7 @@ public void removeRpcRequest(long requestId) { public void addStreamCallback(String streamId, StreamCallback callback) { timeOfLastRequestNs.set(System.nanoTime()); - streamCallbacks.offer(Tuple2.apply(streamId, callback)); + streamCallbacks.offer(new Tuple2<>(streamId, callback)); } @VisibleForTesting From e2f32ee45ac907f1f53fde7e412676a849a94872 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Fri, 30 Jun 2017 12:34:09 +0800 Subject: [PATCH 051/779] [SPARK-21258][SQL] Fix WindowExec complex object aggregation with spilling ## What changes were proposed in this pull request? `WindowExec` currently improperly stores complex objects (UnsafeRow, UnsafeArrayData, UnsafeMapData, UTF8String) during aggregation by keeping a reference in the buffer used by `GeneratedMutableProjections` to the actual input data. Things go wrong when the input object (or the backing bytes) are reused for other things. This could happen in window functions when it starts spilling to disk. 
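As a minimal sketch of the aliasing hazard described above (this is not Spark's actual `UnsafeRow` machinery; `Buf` and `FirstAgg` are illustrative stand-ins for a row backed by a reusable buffer and a FIRST-style aggregate):

object AliasingSketch extends App {
  // Stand-in for a row object backed by a reusable byte buffer.
  final class Buf(var value: Int)

  // A FIRST-style aggregate that stores its input directly keeps a live alias into
  // the shared buffer; copying the input first breaks the alias.
  class FirstAgg(copyInput: Boolean) {
    private var first: Buf = _
    def update(in: Buf): Unit =
      if (first == null) first = if (copyInput) new Buf(in.value) else in
    def result: Int = first.value
  }

  val buffer = new Buf(1)                 // pretend this backs the current input row
  val aliasing = new FirstAgg(copyInput = false)
  val copying  = new FirstAgg(copyInput = true)
  aliasing.update(buffer)
  copying.update(buffer)

  buffer.value = 99                       // the reader reuses the buffer for the next row

  println(aliasing.result)                // 99 -- the "first" value is silently corrupted
  println(copying.result)                 // 1  -- defensive copy, as the patched AggregateProcessor.update does
}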
When reading the back the spill files the `UnsafeSorterSpillReader` reuses the buffer to which the `UnsafeRow` points, leading to weird corruption scenario's. Note that this only happens for aggregate functions that preserve (parts of) their input, for example `FIRST`, `LAST`, `MIN` & `MAX`. This was not seen before, because the spilling logic was not doing actual spills as much and actually used an in-memory page. This page was not cleaned up during window processing and made sure unsafe objects point to their own dedicated memory location. This was changed by https://github.com/apache/spark/pull/16909, after this PR Spark spills more eagerly. This PR provides a surgical fix because we are close to releasing Spark 2.2. This change just makes sure that there cannot be any object reuse at the expensive of a little bit of performance. We will follow-up with a more subtle solution at a later point. ## How was this patch tested? Added a regression test to `DataFrameWindowFunctionsSuite`. Author: Herman van Hovell Closes #18470 from hvanhovell/SPARK-21258. --- .../execution/window/AggregateProcessor.scala | 7 ++- .../sql/DataFrameWindowFunctionsSuite.scala | 47 ++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala index bc141b36e63b4..2195c6ea95948 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala @@ -145,10 +145,13 @@ private[window] final class AggregateProcessor( /** Update the buffer. */ def update(input: InternalRow): Unit = { - updateProjection(join(buffer, input)) + // TODO(hvanhovell) this sacrifices performance for correctness. We should make sure that + // MutableProjection makes copies of the complex input objects it buffer. + val copy = input.copy() + updateProjection(join(buffer, copy)) var i = 0 while (i < numImperatives) { - imperatives(i).update(buffer, input) + imperatives(i).update(buffer, copy) i += 1 } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 1255c49104718..204858fa29787 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.{DataType, LongType, StructType} +import org.apache.spark.sql.types._ /** * Window function testing for DataFrame API. @@ -423,4 +424,48 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { df.select(selectList: _*).where($"value" < 2), Seq(Row(3, "1", null, 3.0, 4.0, 3.0), Row(5, "1", false, 4.0, 5.0, 5.0))) } + + test("SPARK-21258: complex object in combination with spilling") { + // Make sure we trigger the spilling path. + withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "17") { + val sampleSchema = new StructType(). + add("f0", StringType). + add("f1", LongType). 
+ add("f2", ArrayType(new StructType(). + add("f20", StringType))). + add("f3", ArrayType(new StructType(). + add("f30", StringType))) + + val w0 = Window.partitionBy("f0").orderBy("f1") + val w1 = w0.rowsBetween(Long.MinValue, Long.MaxValue) + + val c0 = first(struct($"f2", $"f3")).over(w0) as "c0" + val c1 = last(struct($"f2", $"f3")).over(w1) as "c1" + + val input = + """{"f1":1497820153720,"f2":[{"f20":"x","f21":0}],"f3":[{"f30":"x","f31":0}]} + |{"f1":1497802179638} + |{"f1":1497802189347} + |{"f1":1497802189593} + |{"f1":1497802189597} + |{"f1":1497802189599} + |{"f1":1497802192103} + |{"f1":1497802193414} + |{"f1":1497802193577} + |{"f1":1497802193709} + |{"f1":1497802202883} + |{"f1":1497802203006} + |{"f1":1497802203743} + |{"f1":1497802203834} + |{"f1":1497802203887} + |{"f1":1497802203893} + |{"f1":1497802203976} + |{"f1":1497820168098} + |""".stripMargin.split("\n").toSeq + + import testImplicits._ + + spark.read.schema(sampleSchema).json(input.toDS()).select(c0, c1).foreach { _ => () } + } + } } From fddb63f46345be36c40d9a7f3660920af6502bbd Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 29 Jun 2017 21:35:01 -0700 Subject: [PATCH 052/779] [SPARK-20889][SPARKR] Grouped documentation for MISC column methods ## What changes were proposed in this pull request? Grouped documentation for column misc methods. Author: actuaryzhang Author: Wayne Zhang Closes #18448 from actuaryzhang/sparkRDocMisc. --- R/pkg/R/functions.R | 98 +++++++++++++++++++++------------------------ R/pkg/R/generics.R | 15 ++++--- 2 files changed, 55 insertions(+), 58 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index cb09e847d739a..67cb7a7f6db08 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -150,6 +150,27 @@ NULL #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} NULL +#' Miscellaneous functions for Column operations +#' +#' Miscellaneous functions defined for \code{Column}. +#' +#' @param x Column to compute on. In \code{sha2}, it is one of 224, 256, 384, or 512. +#' @param y Column to compute on. +#' @param ... additional Columns. +#' @name column_misc_functions +#' @rdname column_misc_functions +#' @family misc functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)[, 1:2]) +#' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model), +#' v3 = hash(df$model, df$mpg), v4 = md5(df$model), +#' v5 = sha1(df$model), v6 = sha2(df$model, 256)) +#' head(tmp) +#' } +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -569,19 +590,13 @@ setMethod("count", column(jc) }) -#' crc32 -#' -#' Calculates the cyclic redundancy check value (CRC32) of a binary column and -#' returns the value as a bigint. -#' -#' @param x Column to compute on. +#' @details +#' \code{crc32}: Calculates the cyclic redundancy check value (CRC32) of a binary column +#' and returns the value as a bigint. #' -#' @rdname crc32 -#' @name crc32 -#' @family misc functions -#' @aliases crc32,Column-method +#' @rdname column_misc_functions +#' @aliases crc32 crc32,Column-method #' @export -#' @examples \dontrun{crc32(df$c)} #' @note crc32 since 1.5.0 setMethod("crc32", signature(x = "Column"), @@ -590,19 +605,13 @@ setMethod("crc32", column(jc) }) -#' hash -#' -#' Calculates the hash code of given columns, and returns the result as a int column. -#' -#' @param x Column to compute on. -#' @param ... 
additional Column(s) to be included. +#' @details +#' \code{hash}: Calculates the hash code of given columns, and returns the result +#' as an int column. #' -#' @rdname hash -#' @name hash -#' @family misc functions -#' @aliases hash,Column-method +#' @rdname column_misc_functions +#' @aliases hash hash,Column-method #' @export -#' @examples \dontrun{hash(df$c)} #' @note hash since 2.0.0 setMethod("hash", signature(x = "Column"), @@ -1055,19 +1064,13 @@ setMethod("max", column(jc) }) -#' md5 -#' -#' Calculates the MD5 digest of a binary column and returns the value +#' @details +#' \code{md5}: Calculates the MD5 digest of a binary column and returns the value #' as a 32 character hex string. #' -#' @param x Column to compute on. -#' -#' @rdname md5 -#' @name md5 -#' @family misc functions -#' @aliases md5,Column-method +#' @rdname column_misc_functions +#' @aliases md5 md5,Column-method #' @export -#' @examples \dontrun{md5(df$c)} #' @note md5 since 1.5.0 setMethod("md5", signature(x = "Column"), @@ -1307,19 +1310,13 @@ setMethod("second", column(jc) }) -#' sha1 -#' -#' Calculates the SHA-1 digest of a binary column and returns the value +#' @details +#' \code{sha1}: Calculates the SHA-1 digest of a binary column and returns the value #' as a 40 character hex string. #' -#' @param x Column to compute on. -#' -#' @rdname sha1 -#' @name sha1 -#' @family misc functions -#' @aliases sha1,Column-method +#' @rdname column_misc_functions +#' @aliases sha1 sha1,Column-method #' @export -#' @examples \dontrun{sha1(df$c)} #' @note sha1 since 1.5.0 setMethod("sha1", signature(x = "Column"), @@ -2309,19 +2306,14 @@ setMethod("format_number", signature(y = "Column", x = "numeric"), column(jc) }) -#' sha2 -#' -#' Calculates the SHA-2 family of hash functions of a binary column and -#' returns the value as a hex string. +#' @details +#' \code{sha2}: Calculates the SHA-2 family of hash functions of a binary column and +#' returns the value as a hex string. The second argument \code{x} specifies the number +#' of bits, and is one of 224, 256, 384, or 512. #' -#' @param y column to compute SHA-2 on. -#' @param x one of 224, 256, 384, or 512. -#' @family misc functions -#' @rdname sha2 -#' @name sha2 -#' @aliases sha2,Column,numeric-method +#' @rdname column_misc_functions +#' @aliases sha2 sha2,Column,numeric-method #' @export -#' @examples \dontrun{sha2(df$c, 256)} #' @note sha2 since 1.5.0 setMethod("sha2", signature(y = "Column", x = "numeric"), function(y, x) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 1deb057bb1b82..bdd4b360f4973 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -992,8 +992,9 @@ setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") }) #' @name NULL setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") }) -#' @rdname crc32 +#' @rdname column_misc_functions #' @export +#' @name NULL setGeneric("crc32", function(x) { standardGeneric("crc32") }) #' @rdname column_nonaggregate_functions @@ -1006,8 +1007,9 @@ setGeneric("create_array", function(x, ...) { standardGeneric("create_array") }) #' @name NULL setGeneric("create_map", function(x, ...) { standardGeneric("create_map") }) -#' @rdname hash +#' @rdname column_misc_functions #' @export +#' @name NULL setGeneric("hash", function(x, ...) { standardGeneric("hash") }) #' @param x empty. Should be used with no argument. 
@@ -1205,8 +1207,9 @@ setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") }) #' @name NULL setGeneric("ltrim", function(x) { standardGeneric("ltrim") }) -#' @rdname md5 +#' @rdname column_misc_functions #' @export +#' @name NULL setGeneric("md5", function(x) { standardGeneric("md5") }) #' @rdname column_datetime_functions @@ -1350,12 +1353,14 @@ setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") }) #' @name NULL setGeneric("second", function(x) { standardGeneric("second") }) -#' @rdname sha1 +#' @rdname column_misc_functions #' @export +#' @name NULL setGeneric("sha1", function(x) { standardGeneric("sha1") }) -#' @rdname sha2 +#' @rdname column_misc_functions #' @export +#' @name NULL setGeneric("sha2", function(y, x) { standardGeneric("sha2") }) #' @rdname column_math_functions From 52981715bb8d653a1141f55b36da804412eb783a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 29 Jun 2017 23:00:50 -0700 Subject: [PATCH 053/779] [SPARK-20889][SPARKR] Grouped documentation for COLLECTION column methods ## What changes were proposed in this pull request? Grouped documentation for column collection methods. Author: actuaryzhang Author: Wayne Zhang Closes #18458 from actuaryzhang/sparkRDocCollection. --- R/pkg/R/functions.R | 204 +++++++++++++++++++------------------------- R/pkg/R/generics.R | 27 ++++-- 2 files changed, 108 insertions(+), 123 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 67cb7a7f6db08..a1f5c4f8cc18d 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -171,6 +171,35 @@ NULL #' } NULL +#' Collection functions for Column operations +#' +#' Collection functions defined for \code{Column}. +#' +#' @param x Column to compute on. Note the difference in the following methods: +#' \itemize{ +#' \item \code{to_json}: it is the column containing the struct or array of the structs. +#' \item \code{from_json}: it is the column containing the JSON string. +#' } +#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains +#' additional named properties to control how it is converted, accepts the same +#' options as the JSON data source. +#' @name column_collection_functions +#' @rdname column_collection_functions +#' @family collection functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) +#' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1))) +#' tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) +#' head(tmp2) +#' head(select(tmp, posexplode(tmp$v1))) +#' head(select(tmp, sort_array(tmp$v1))) +#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))} +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -1642,30 +1671,23 @@ setMethod("to_date", column(jc) }) -#' to_json -#' -#' Converts a column containing a \code{structType} or array of \code{structType} into a Column -#' of JSON string. Resolving the Column can fail if an unsupported type is encountered. -#' -#' @param x Column containing the struct or array of the structs -#' @param ... additional named properties to control how it is converted, accepts the same options -#' as the JSON data source. 
+#' @details +#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType} +#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered. #' -#' @family non-aggregate functions -#' @rdname to_json -#' @name to_json -#' @aliases to_json,Column-method +#' @rdname column_collection_functions +#' @aliases to_json to_json,Column-method #' @export #' @examples +#' #' \dontrun{ #' # Converts a struct into a JSON object -#' df <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") -#' select(df, to_json(df$d, dateFormat = 'dd/MM/yyyy')) +#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") +#' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy')) #' #' # Converts an array of structs into a JSON array -#' df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") -#' select(df, to_json(df$people)) -#'} +#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") +#' df2 <- mutate(df2, people_json = to_json(df2$people))} #' @note to_json since 2.2.0 setMethod("to_json", signature(x = "Column"), function(x, ...) { @@ -2120,28 +2142,28 @@ setMethod("date_format", signature(y = "Column", x = "character"), column(jc) }) -#' from_json -#' -#' Parses a column containing a JSON string into a Column of \code{structType} with the specified -#' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TRUE}. -#' If the string is unparseable, the Column will contains the value NA. +#' @details +#' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType} +#' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set +#' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA. #' -#' @param x Column containing the JSON string. +#' @rdname column_collection_functions #' @param schema a structType object to use as the schema to use when parsing the JSON string. #' @param as.json.array indicating if input string is JSON array of objects or a single object. -#' @param ... additional named properties to control how the json is parsed, accepts the same -#' options as the JSON data source. -#' -#' @family non-aggregate functions -#' @rdname from_json -#' @name from_json -#' @aliases from_json,Column,structType-method +#' @aliases from_json from_json,Column,structType-method #' @export #' @examples +#' #' \dontrun{ -#' schema <- structType(structField("name", "string"), -#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy")) -#'} +#' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") +#' df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy')) +#' schema <- structType(structField("date", "string")) +#' head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy'))) + +#' df2 <- sql("SELECT named_struct('name', 'Bob') as people") +#' df2 <- mutate(df2, people_json = to_json(df2$people)) +#' schema <- structType(structField("name", "string")) +#' head(select(df2, from_json(df2$people_json, schema)))} #' @note from_json since 2.2.0 setMethod("from_json", signature(x = "Column", schema = "structType"), function(x, schema, as.json.array = FALSE, ...) 
{ @@ -3101,18 +3123,14 @@ setMethod("row_number", ###################### Collection functions###################### -#' array_contains -#' -#' Returns null if the array is null, true if the array contains the value, and false otherwise. +#' @details +#' \code{array_contains}: Returns null if the array is null, true if the array contains +#' the value, and false otherwise. #' -#' @param x A Column #' @param value A value to be checked if contained in the column -#' @rdname array_contains -#' @aliases array_contains,Column-method -#' @name array_contains -#' @family collection functions +#' @rdname column_collection_functions +#' @aliases array_contains array_contains,Column-method #' @export -#' @examples \dontrun{array_contains(df$c, 1)} #' @note array_contains since 1.6.0 setMethod("array_contains", signature(x = "Column", value = "ANY"), @@ -3121,18 +3139,12 @@ setMethod("array_contains", column(jc) }) -#' explode -#' -#' Creates a new row for each element in the given array or map column. -#' -#' @param x Column to compute on +#' @details +#' \code{explode}: Creates a new row for each element in the given array or map column. #' -#' @rdname explode -#' @name explode -#' @family collection functions -#' @aliases explode,Column-method +#' @rdname column_collection_functions +#' @aliases explode explode,Column-method #' @export -#' @examples \dontrun{explode(df$c)} #' @note explode since 1.5.0 setMethod("explode", signature(x = "Column"), @@ -3141,18 +3153,12 @@ setMethod("explode", column(jc) }) -#' size -#' -#' Returns length of array or map. -#' -#' @param x Column to compute on +#' @details +#' \code{size}: Returns length of array or map. #' -#' @rdname size -#' @name size -#' @aliases size,Column-method -#' @family collection functions +#' @rdname column_collection_functions +#' @aliases size size,Column-method #' @export -#' @examples \dontrun{size(df$c)} #' @note size since 1.5.0 setMethod("size", signature(x = "Column"), @@ -3161,25 +3167,16 @@ setMethod("size", column(jc) }) -#' sort_array -#' -#' Sorts the input array in ascending or descending order according +#' @details +#' \code{sort_array}: Sorts the input array in ascending or descending order according #' to the natural ordering of the array elements. #' -#' @param x A Column to sort +#' @rdname column_collection_functions #' @param asc A logical flag indicating the sorting order. #' TRUE, sorting is in ascending order. #' FALSE, sorting is in descending order. -#' @rdname sort_array -#' @name sort_array -#' @aliases sort_array,Column-method -#' @family collection functions +#' @aliases sort_array sort_array,Column-method #' @export -#' @examples -#' \dontrun{ -#' sort_array(df$c) -#' sort_array(df$c, FALSE) -#' } #' @note sort_array since 1.6.0 setMethod("sort_array", signature(x = "Column"), @@ -3188,18 +3185,13 @@ setMethod("sort_array", column(jc) }) -#' posexplode -#' -#' Creates a new row for each element with position in the given array or map column. -#' -#' @param x Column to compute on +#' @details +#' \code{posexplode}: Creates a new row for each element with position in the given array +#' or map column. 
#' -#' @rdname posexplode -#' @name posexplode -#' @family collection functions -#' @aliases posexplode,Column-method +#' @rdname column_collection_functions +#' @aliases posexplode posexplode,Column-method #' @export -#' @examples \dontrun{posexplode(df$c)} #' @note posexplode since 2.1.0 setMethod("posexplode", signature(x = "Column"), @@ -3325,27 +3317,24 @@ setMethod("repeat_string", column(jc) }) -#' explode_outer -#' -#' Creates a new row for each element in the given array or map column. +#' @details +#' \code{explode}: Creates a new row for each element in the given array or map column. #' Unlike \code{explode}, if the array/map is \code{null} or empty #' then \code{null} is produced. #' -#' @param x Column to compute on #' -#' @rdname explode_outer -#' @name explode_outer -#' @family collection functions -#' @aliases explode_outer,Column-method +#' @rdname column_collection_functions +#' @aliases explode_outer explode_outer,Column-method #' @export #' @examples +#' #' \dontrun{ -#' df <- createDataFrame(data.frame( +#' df2 <- createDataFrame(data.frame( #' id = c(1, 2, 3), text = c("a,b,c", NA, "d,e") #' )) #' -#' head(select(df, df$id, explode_outer(split_string(df$text, ",")))) -#' } +#' head(select(df2, df2$id, explode_outer(split_string(df2$text, ",")))) +#' head(select(df2, df2$id, posexplode_outer(split_string(df2$text, ","))))} #' @note explode_outer since 2.3.0 setMethod("explode_outer", signature(x = "Column"), @@ -3354,27 +3343,14 @@ setMethod("explode_outer", column(jc) }) -#' posexplode_outer -#' -#' Creates a new row for each element with position in the given array or map column. -#' Unlike \code{posexplode}, if the array/map is \code{null} or empty +#' @details +#' \code{posexplode_outer}: Creates a new row for each element with position in the given +#' array or map column. Unlike \code{posexplode}, if the array/map is \code{null} or empty #' then the row (\code{null}, \code{null}) is produced. #' -#' @param x Column to compute on -#' -#' @rdname posexplode_outer -#' @name posexplode_outer -#' @family collection functions -#' @aliases posexplode_outer,Column-method +#' @rdname column_collection_functions +#' @aliases posexplode_outer posexplode_outer,Column-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(data.frame( -#' id = c(1, 2, 3), text = c("a,b,c", NA, "d,e") -#' )) -#' -#' head(select(df, df$id, posexplode_outer(split_string(df$text, ",")))) -#' } #' @note posexplode_outer since 2.3.0 setMethod("posexplode_outer", signature(x = "Column"), diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index bdd4b360f4973..b901b74e4728d 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -913,8 +913,9 @@ setGeneric("add_months", function(y, x) { standardGeneric("add_months") }) #' @name NULL setGeneric("approxCountDistinct", function(x, ...) 
{ standardGeneric("approxCountDistinct") }) -#' @rdname array_contains +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") }) #' @rdname column_string_functions @@ -1062,12 +1063,14 @@ setGeneric("dense_rank", function(x = "missing") { standardGeneric("dense_rank") #' @name NULL setGeneric("encode", function(x, charset) { standardGeneric("encode") }) -#' @rdname explode +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("explode", function(x) { standardGeneric("explode") }) -#' @rdname explode_outer +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("explode_outer", function(x) { standardGeneric("explode_outer") }) #' @rdname column_nonaggregate_functions @@ -1090,8 +1093,9 @@ setGeneric("format_number", function(y, x) { standardGeneric("format_number") }) #' @name NULL setGeneric("format_string", function(format, x, ...) { standardGeneric("format_string") }) -#' @rdname from_json +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") }) #' @rdname column_datetime_functions @@ -1275,12 +1279,14 @@ setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_ra #' @name NULL setGeneric("pmod", function(y, x) { standardGeneric("pmod") }) -#' @rdname posexplode +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("posexplode", function(x) { standardGeneric("posexplode") }) -#' @rdname posexplode_outer +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("posexplode_outer", function(x) { standardGeneric("posexplode_outer") }) #' @rdname column_datetime_functions @@ -1383,8 +1389,9 @@ setGeneric("shiftRightUnsigned", function(y, x) { standardGeneric("shiftRightUns #' @name NULL setGeneric("signum", function(x) { standardGeneric("signum") }) -#' @rdname size +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("size", function(x) { standardGeneric("size") }) #' @rdname column_aggregate_functions @@ -1392,8 +1399,9 @@ setGeneric("size", function(x) { standardGeneric("size") }) #' @name NULL setGeneric("skewness", function(x) { standardGeneric("skewness") }) -#' @rdname sort_array +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") }) #' @rdname column_string_functions @@ -1456,8 +1464,9 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") }) #' @name NULL setGeneric("to_date", function(x, format) { standardGeneric("to_date") }) -#' @rdname to_json +#' @rdname column_collection_functions #' @export +#' @name NULL setGeneric("to_json", function(x, ...) { standardGeneric("to_json") }) #' @rdname column_datetime_functions From 49d767d838691fc7d964be2c4349662f5500ff2b Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 30 Jun 2017 20:02:15 +0800 Subject: [PATCH 054/779] [SPARK-18710][ML] Add offset in GLM ## What changes were proposed in this pull request? Add support for offset in GLM. This is useful for at least two reasons: 1. Account for exposure: e.g., when modeling the number of accidents, we may need to use miles driven as an offset to access factors on frequency. 2. Test incremental effects of new variables: we can use predictions from the existing model as offset and run a much smaller model on only new variables. 
This avoids re-estimating the large model with all variables (old + new) and can be very important for efficient large-scaled analysis. ## How was this patch tested? New test. yanboliang srowen felixcheung sethah Author: actuaryzhang Closes #16699 from actuaryzhang/offset. --- .../apache/spark/ml/feature/Instance.scala | 21 + .../IterativelyReweightedLeastSquares.scala | 14 +- .../spark/ml/optim/WeightedLeastSquares.scala | 2 +- .../GeneralizedLinearRegression.scala | 184 +++-- ...erativelyReweightedLeastSquaresSuite.scala | 40 +- .../GeneralizedLinearRegressionSuite.scala | 634 ++++++++++-------- 6 files changed, 534 insertions(+), 361 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Instance.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Instance.scala index cce3ca45ccd8f..dd56fbbfa2b63 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Instance.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Instance.scala @@ -27,3 +27,24 @@ import org.apache.spark.ml.linalg.Vector * @param features The vector of features for this data point. */ private[ml] case class Instance(label: Double, weight: Double, features: Vector) + +/** + * Case class that represents an instance of data point with + * label, weight, offset and features. + * This is mainly used in GeneralizedLinearRegression currently. + * + * @param label Label for this data point. + * @param weight The weight of this instance. + * @param offset The offset used for this data point. + * @param features The vector of features for this data point. + */ +private[ml] case class OffsetInstance( + label: Double, + weight: Double, + offset: Double, + features: Vector) { + + /** Converts to an [[Instance]] object by leaving out the offset. */ + def toInstance: Instance = Instance(label, weight, features) + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala index 9c495512422ba..6961b45f55e4d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.optim import org.apache.spark.internal.Logging -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg._ import org.apache.spark.rdd.RDD @@ -43,7 +43,7 @@ private[ml] class IterativelyReweightedLeastSquaresModel( * find M-estimator in robust regression and other optimization problems. * * @param initialModel the initial guess model. - * @param reweightFunc the reweight function which is used to update offsets and weights + * @param reweightFunc the reweight function which is used to update working labels and weights * at each iteration. * @param fitIntercept whether to fit intercept. * @param regParam L2 regularization parameter used by WLS. 
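As a usage sketch of the new `offsetCol` parameter (illustrative only: the SparkSession `spark`, the data, and the column names below are assumptions, not taken from this patch), a Poisson GLM can model accident counts with log(miles driven) as the offset, so the fitted coefficients describe a rate per mile; the offset enters the linear predictor with a fixed coefficient of 1.0:

    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.regression.GeneralizedLinearRegression

    // Hypothetical claims data: label = accident count, logMiles = log(exposure).
    val claims = spark.createDataFrame(Seq(
      (2.0, 1.0, math.log(1000.0), Vectors.dense(0.0, 5.0)),
      (8.0, 2.0, math.log(5000.0), Vectors.dense(1.0, 7.0)),
      (3.0, 3.0, math.log(1200.0), Vectors.dense(2.0, 11.0)),
      (9.0, 4.0, math.log(8000.0), Vectors.dense(3.0, 13.0))
    )).toDF("label", "weight", "logMiles", "features")

    val glr = new GeneralizedLinearRegression()
      .setFamily("poisson")
      .setLink("log")
      .setWeightCol("weight")
      .setOffsetCol("logMiles")           // setter added by this patch (Since 2.3.0)
      .setLinkPredictionCol("linkPrediction")

    val model = glr.fit(claims)
    model.transform(claims).show()        // predictions include the per-row offset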
@@ -57,13 +57,13 @@ private[ml] class IterativelyReweightedLeastSquaresModel( */ private[ml] class IterativelyReweightedLeastSquares( val initialModel: WeightedLeastSquaresModel, - val reweightFunc: (Instance, WeightedLeastSquaresModel) => (Double, Double), + val reweightFunc: (OffsetInstance, WeightedLeastSquaresModel) => (Double, Double), val fitIntercept: Boolean, val regParam: Double, val maxIter: Int, val tol: Double) extends Logging with Serializable { - def fit(instances: RDD[Instance]): IterativelyReweightedLeastSquaresModel = { + def fit(instances: RDD[OffsetInstance]): IterativelyReweightedLeastSquaresModel = { var converged = false var iter = 0 @@ -75,10 +75,10 @@ private[ml] class IterativelyReweightedLeastSquares( oldModel = model - // Update offsets and weights using reweightFunc + // Update working labels and weights using reweightFunc val newInstances = instances.map { instance => - val (newOffset, newWeight) = reweightFunc(instance, oldModel) - Instance(newOffset, newWeight, instance.features) + val (newLabel, newWeight) = reweightFunc(instance, oldModel) + Instance(newLabel, newWeight, instance.features) } // Estimate new model diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 56ab9675700a0..32b0af72ba9bb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.optim import org.apache.spark.internal.Logging -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg._ import org.apache.spark.rdd.RDD diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index bff0d9bbb46ff..ce3460ae43566 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -26,8 +26,8 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.PredictorParams -import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg.{BLAS, Vector} +import org.apache.spark.ml.feature.{Instance, OffsetInstance} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.optim._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ -138,6 +138,27 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam @Since("2.0.0") def getLinkPredictionCol: String = $(linkPredictionCol) + /** + * Param for offset column name. If this is not set or empty, we treat all instance offsets + * as 0.0. The feature specified as offset has a constant coefficient of 1.0. + * @group param + */ + @Since("2.3.0") + final val offsetCol: Param[String] = new Param[String](this, "offsetCol", "The offset " + + "column name. If this is not set or empty, we treat all instance offsets as 0.0") + + /** @group getParam */ + @Since("2.3.0") + def getOffsetCol: String = $(offsetCol) + + /** Checks whether weight column is set and nonempty. 
*/ + private[regression] def hasWeightCol: Boolean = + isSet(weightCol) && $(weightCol).nonEmpty + + /** Checks whether offset column is set and nonempty. */ + private[regression] def hasOffsetCol: Boolean = + isSet(offsetCol) && $(offsetCol).nonEmpty + /** Checks whether we should output link prediction. */ private[regression] def hasLinkPredictionCol: Boolean = { isDefined(linkPredictionCol) && $(linkPredictionCol).nonEmpty @@ -172,6 +193,11 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam } val newSchema = super.validateAndTransformSchema(schema, fitting, featuresDataType) + + if (hasOffsetCol) { + SchemaUtils.checkNumericType(schema, $(offsetCol)) + } + if (hasLinkPredictionCol) { SchemaUtils.appendColumn(newSchema, $(linkPredictionCol), DoubleType) } else { @@ -306,6 +332,16 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val @Since("2.0.0") def setWeightCol(value: String): this.type = set(weightCol, value) + /** + * Sets the value of param [[offsetCol]]. + * If this is not set or empty, we treat all instance offsets as 0.0. + * Default is not set, so all instances have offset 0.0. + * + * @group setParam + */ + @Since("2.3.0") + def setOffsetCol(value: String): this.type = set(offsetCol, value) + /** * Sets the solver algorithm used for optimization. * Currently only supports "irls" which is also the default solver. @@ -329,7 +365,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size val instr = Instrumentation.create(this, dataset) - instr.logParams(labelCol, featuresCol, weightCol, predictionCol, linkPredictionCol, + instr.logParams(labelCol, featuresCol, weightCol, offsetCol, predictionCol, linkPredictionCol, family, solver, fitIntercept, link, maxIter, regParam, tol) instr.logNumFeatures(numFeatures) @@ -343,15 +379,16 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val "GeneralizedLinearRegression was given data with 0 features, and with Param fitIntercept " + "set to false. To fit a model with 0 features, fitIntercept must be set to true." ) - val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) - val instances: RDD[Instance] = - dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map { - case Row(label: Double, weight: Double, features: Vector) => - Instance(label, weight, features) - } + val w = if (!hasWeightCol) lit(1.0) else col($(weightCol)) + val offset = if (!hasOffsetCol) lit(0.0) else col($(offsetCol)).cast(DoubleType) val model = if (familyAndLink.family == Gaussian && familyAndLink.link == Identity) { // TODO: Make standardizeFeatures and standardizeLabel configurable. 
+ val instances: RDD[Instance] = + dataset.select(col($(labelCol)), w, offset, col($(featuresCol))).rdd.map { + case Row(label: Double, weight: Double, offset: Double, features: Vector) => + Instance(label - offset, weight, features) + } val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = 0.0, standardizeFeatures = true, standardizeLabel = true) val wlsModel = optimizer.fit(instances) @@ -362,6 +399,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val wlsModel.diagInvAtWA.toArray, 1, getSolver) model.setSummary(Some(trainingSummary)) } else { + val instances: RDD[OffsetInstance] = + dataset.select(col($(labelCol)), w, offset, col($(featuresCol))).rdd.map { + case Row(label: Double, weight: Double, offset: Double, features: Vector) => + OffsetInstance(label, weight, offset, features) + } // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS). val initialModel = familyAndLink.initialize(instances, $(fitIntercept), $(regParam)) val optimizer = new IterativelyReweightedLeastSquares(initialModel, @@ -425,12 +467,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine * Get the initial guess model for [[IterativelyReweightedLeastSquares]]. */ def initialize( - instances: RDD[Instance], + instances: RDD[OffsetInstance], fitIntercept: Boolean, regParam: Double): WeightedLeastSquaresModel = { val newInstances = instances.map { instance => val mu = family.initialize(instance.label, instance.weight) - val eta = predict(mu) + val eta = predict(mu) - instance.offset Instance(eta, instance.weight, instance.features) } // TODO: Make standardizeFeatures and standardizeLabel configurable. @@ -441,16 +483,16 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } /** - * The reweight function used to update offsets and weights + * The reweight function used to update working labels and weights * at each iteration of [[IterativelyReweightedLeastSquares]]. */ - val reweightFunc: (Instance, WeightedLeastSquaresModel) => (Double, Double) = { - (instance: Instance, model: WeightedLeastSquaresModel) => { - val eta = model.predict(instance.features) + val reweightFunc: (OffsetInstance, WeightedLeastSquaresModel) => (Double, Double) = { + (instance: OffsetInstance, model: WeightedLeastSquaresModel) => { + val eta = model.predict(instance.features) + instance.offset val mu = fitted(eta) - val offset = eta + (instance.label - mu) * link.deriv(mu) - val weight = instance.weight / (math.pow(this.link.deriv(mu), 2.0) * family.variance(mu)) - (offset, weight) + val newLabel = eta - instance.offset + (instance.label - mu) * link.deriv(mu) + val newWeight = instance.weight / (math.pow(this.link.deriv(mu), 2.0) * family.variance(mu)) + (newLabel, newWeight) } } } @@ -950,15 +992,22 @@ class GeneralizedLinearRegressionModel private[ml] ( private lazy val familyAndLink = FamilyAndLink(this) override protected def predict(features: Vector): Double = { - val eta = predictLink(features) + predict(features, 0.0) + } + + /** + * Calculates the predicted value when offset is set. + */ + private def predict(features: Vector, offset: Double): Double = { + val eta = predictLink(features, offset) familyAndLink.fitted(eta) } /** - * Calculate the link prediction (linear predictor) of the given instance. + * Calculates the link prediction (linear predictor) of the given instance. 
*/ - private def predictLink(features: Vector): Double = { - BLAS.dot(features, coefficients) + intercept + private def predictLink(features: Vector, offset: Double): Double = { + BLAS.dot(features, coefficients) + intercept + offset } override def transform(dataset: Dataset[_]): DataFrame = { @@ -967,14 +1016,16 @@ class GeneralizedLinearRegressionModel private[ml] ( } override protected def transformImpl(dataset: Dataset[_]): DataFrame = { - val predictUDF = udf { (features: Vector) => predict(features) } - val predictLinkUDF = udf { (features: Vector) => predictLink(features) } + val predictUDF = udf { (features: Vector, offset: Double) => predict(features, offset) } + val predictLinkUDF = udf { (features: Vector, offset: Double) => predictLink(features, offset) } + + val offset = if (!hasOffsetCol) lit(0.0) else col($(offsetCol)).cast(DoubleType) var output = dataset if ($(predictionCol).nonEmpty) { - output = output.withColumn($(predictionCol), predictUDF(col($(featuresCol)))) + output = output.withColumn($(predictionCol), predictUDF(col($(featuresCol)), offset)) } if (hasLinkPredictionCol) { - output = output.withColumn($(linkPredictionCol), predictLinkUDF(col($(featuresCol)))) + output = output.withColumn($(linkPredictionCol), predictLinkUDF(col($(featuresCol)), offset)) } output.toDF() } @@ -1146,9 +1197,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** Degrees of freedom. */ @Since("2.0.0") - lazy val degreesOfFreedom: Long = { - numInstances - rank - } + lazy val degreesOfFreedom: Long = numInstances - rank /** The residual degrees of freedom. */ @Since("2.0.0") @@ -1156,18 +1205,20 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** The residual degrees of freedom for the null model. */ @Since("2.0.0") - lazy val residualDegreeOfFreedomNull: Long = if (model.getFitIntercept) { - numInstances - 1 - } else { - numInstances + lazy val residualDegreeOfFreedomNull: Long = { + if (model.getFitIntercept) numInstances - 1 else numInstances } - private def weightCol: Column = { - if (!model.isDefined(model.weightCol) || model.getWeightCol.isEmpty) { - lit(1.0) - } else { - col(model.getWeightCol) - } + private def label: Column = col(model.getLabelCol).cast(DoubleType) + + private def prediction: Column = col(predictionCol) + + private def weight: Column = { + if (!model.hasWeightCol) lit(1.0) else col(model.getWeightCol) + } + + private def offset: Column = { + if (!model.hasOffsetCol) lit(0.0) else col(model.getOffsetCol).cast(DoubleType) } private[regression] lazy val devianceResiduals: DataFrame = { @@ -1175,25 +1226,23 @@ class GeneralizedLinearRegressionSummary private[regression] ( val r = math.sqrt(math.max(family.deviance(y, mu, weight), 0.0)) if (y > mu) r else -1.0 * r } - val w = weightCol predictions.select( - drUDF(col(model.getLabelCol), col(predictionCol), w).as("devianceResiduals")) + drUDF(label, prediction, weight).as("devianceResiduals")) } private[regression] lazy val pearsonResiduals: DataFrame = { val prUDF = udf { mu: Double => family.variance(mu) } - val w = weightCol - predictions.select(col(model.getLabelCol).minus(col(predictionCol)) - .multiply(sqrt(w)).divide(sqrt(prUDF(col(predictionCol)))).as("pearsonResiduals")) + predictions.select(label.minus(prediction) + .multiply(sqrt(weight)).divide(sqrt(prUDF(prediction))).as("pearsonResiduals")) } private[regression] lazy val workingResiduals: DataFrame = { val wrUDF = udf { (y: Double, mu: Double) => (y - mu) * link.deriv(mu) } - 
predictions.select(wrUDF(col(model.getLabelCol), col(predictionCol)).as("workingResiduals")) + predictions.select(wrUDF(label, prediction).as("workingResiduals")) } private[regression] lazy val responseResiduals: DataFrame = { - predictions.select(col(model.getLabelCol).minus(col(predictionCol)).as("responseResiduals")) + predictions.select(label.minus(prediction).as("responseResiduals")) } /** @@ -1225,16 +1274,35 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") lazy val nullDeviance: Double = { - val w = weightCol - val wtdmu: Double = if (model.getFitIntercept) { - val agg = predictions.agg(sum(w.multiply(col(model.getLabelCol))), sum(w)).first() - agg.getDouble(0) / agg.getDouble(1) + val intercept: Double = if (!model.getFitIntercept) { + 0.0 } else { - link.unlink(0.0) + /* + Estimate intercept analytically when there is no offset, or when there is offset but + the model is Gaussian family with identity link. Otherwise, fit an intercept only model. + */ + if (!model.hasOffsetCol || + (model.hasOffsetCol && family == Gaussian && link == Identity)) { + val agg = predictions.agg(sum(weight.multiply( + label.minus(offset))), sum(weight)).first() + link.link(agg.getDouble(0) / agg.getDouble(1)) + } else { + // Create empty feature column and fit intercept only model using param setting from model + val featureNull = "feature_" + java.util.UUID.randomUUID.toString + val paramMap = model.extractParamMap() + paramMap.put(model.featuresCol, featureNull) + if (family.name != "tweedie") { + paramMap.remove(model.variancePower) + } + val emptyVectorUDF = udf{ () => Vectors.zeros(0) } + model.parent.fit( + dataset.withColumn(featureNull, emptyVectorUDF()), paramMap + ).intercept + } } - predictions.select(col(model.getLabelCol).cast(DoubleType), w).rdd.map { - case Row(y: Double, weight: Double) => - family.deviance(y, wtdmu, weight) + predictions.select(label, offset, weight).rdd.map { + case Row(y: Double, offset: Double, weight: Double) => + family.deviance(y, link.unlink(intercept + offset), weight) }.sum() } @@ -1243,8 +1311,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( */ @Since("2.0.0") lazy val deviance: Double = { - val w = weightCol - predictions.select(col(model.getLabelCol).cast(DoubleType), col(predictionCol), w).rdd.map { + predictions.select(label, prediction, weight).rdd.map { case Row(label: Double, pred: Double, weight: Double) => family.deviance(label, pred, weight) }.sum() @@ -1269,10 +1336,9 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** Akaike Information Criterion (AIC) for the fitted model. 
*/ @Since("2.0.0") lazy val aic: Double = { - val w = weightCol - val weightSum = predictions.select(w).agg(sum(w)).first().getDouble(0) + val weightSum = predictions.select(weight).agg(sum(weight)).first().getDouble(0) val t = predictions.select( - col(model.getLabelCol).cast(DoubleType), col(predictionCol), w).rdd.map { + label, prediction, weight).rdd.map { case Row(label: Double, pred: Double, weight: Double) => (label, pred, weight) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala index 50260952ecb66..6d143504fcf58 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquaresSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.optim import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -26,8 +26,8 @@ import org.apache.spark.rdd.RDD class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext { - private var instances1: RDD[Instance] = _ - private var instances2: RDD[Instance] = _ + private var instances1: RDD[OffsetInstance] = _ + private var instances2: RDD[OffsetInstance] = _ override def beforeAll(): Unit = { super.beforeAll() @@ -39,10 +39,10 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes w <- c(1, 2, 3, 4) */ instances1 = sc.parallelize(Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(0.0, 2.0, Vectors.dense(1.0, 2.0)), - Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)), - Instance(0.0, 4.0, Vectors.dense(3.0, 3.0)) + OffsetInstance(1.0, 1.0, 0.0, Vectors.dense(0.0, 5.0).toSparse), + OffsetInstance(0.0, 2.0, 0.0, Vectors.dense(1.0, 2.0)), + OffsetInstance(1.0, 3.0, 0.0, Vectors.dense(2.0, 1.0)), + OffsetInstance(0.0, 4.0, 0.0, Vectors.dense(3.0, 3.0)) ), 2) /* R code: @@ -52,10 +52,10 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes w <- c(1, 2, 3, 4) */ instances2 = sc.parallelize(Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), - Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), - Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)) + OffsetInstance(2.0, 1.0, 0.0, Vectors.dense(0.0, 5.0).toSparse), + OffsetInstance(8.0, 2.0, 0.0, Vectors.dense(1.0, 7.0)), + OffsetInstance(3.0, 3.0, 0.0, Vectors.dense(2.0, 11.0)), + OffsetInstance(9.0, 4.0, 0.0, Vectors.dense(3.0, 13.0)) ), 2) } @@ -156,7 +156,7 @@ class IterativelyReweightedLeastSquaresSuite extends SparkFunSuite with MLlibTes var idx = 0 for (fitIntercept <- Seq(false, true)) { val initial = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0, - standardizeFeatures = false, standardizeLabel = false).fit(instances2) + standardizeFeatures = false, standardizeLabel = false).fit(instances2.map(_.toInstance)) val irls = new IterativelyReweightedLeastSquares(initial, L1RegressionReweightFunc, fitIntercept, regParam = 0.0, maxIter = 200, tol = 1e-7).fit(instances2) val actual = Vectors.dense(irls.intercept, irls.coefficients(0), irls.coefficients(1)) @@ -169,29 +169,29 @@ class IterativelyReweightedLeastSquaresSuite extends 
SparkFunSuite with MLlibTes object IterativelyReweightedLeastSquaresSuite { def BinomialReweightFunc( - instance: Instance, + instance: OffsetInstance, model: WeightedLeastSquaresModel): (Double, Double) = { - val eta = model.predict(instance.features) + val eta = model.predict(instance.features) + instance.offset val mu = 1.0 / (1.0 + math.exp(-1.0 * eta)) - val z = eta + (instance.label - mu) / (mu * (1.0 - mu)) + val z = eta - instance.offset + (instance.label - mu) / (mu * (1.0 - mu)) val w = mu * (1 - mu) * instance.weight (z, w) } def PoissonReweightFunc( - instance: Instance, + instance: OffsetInstance, model: WeightedLeastSquaresModel): (Double, Double) = { - val eta = model.predict(instance.features) + val eta = model.predict(instance.features) + instance.offset val mu = math.exp(eta) - val z = eta + (instance.label - mu) / mu + val z = eta - instance.offset + (instance.label - mu) / mu val w = mu * instance.weight (z, w) } def L1RegressionReweightFunc( - instance: Instance, + instance: OffsetInstance, model: WeightedLeastSquaresModel): (Double, Double) = { - val eta = model.predict(instance.features) + val eta = model.predict(instance.features) + instance.offset val e = math.max(math.abs(eta - instance.label), 1e-7) val w = 1 / e val y = instance.label diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index f7c7c001a36af..cfaa57314bd66 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -21,7 +21,7 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} @@ -797,77 +797,160 @@ class GeneralizedLinearRegressionSuite } } - test("glm summary: gaussian family with weight") { + test("generalized linear regression with weight and offset") { /* - R code: + R code: + library(statmod) - A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) - b <- c(17, 19, 23, 29) - w <- c(1, 2, 3, 4) - df <- as.data.frame(cbind(A, b)) - */ - val datasetWithWeight = Seq( - Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), - Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)), - Instance(29.0, 4.0, Vectors.dense(3.0, 13.0)) + df <- as.data.frame(matrix(c( + 0.2, 1.0, 2.0, 0.0, 5.0, + 0.5, 2.1, 0.5, 1.0, 2.0, + 0.9, 0.4, 1.0, 2.0, 1.0, + 0.7, 0.7, 0.0, 3.0, 3.0), 4, 5, byrow = TRUE)) + families <- list(gaussian, binomial, poisson, Gamma, tweedie(1.5)) + f1 <- V1 ~ -1 + V4 + V5 + f2 <- V1 ~ V4 + V5 + for (f in c(f1, f2)) { + for (fam in families) { + model <- glm(f, df, family = fam, weights = V2, offset = V3) + print(as.vector(coef(model))) + } + } + [1] 0.5169222 -0.3344444 + [1] 0.9419107 -0.6864404 + [1] 0.1812436 -0.6568422 + [1] -0.2869094 0.7857710 + [1] 0.1055254 0.2979113 + [1] -0.05990345 0.53188982 -0.32118415 + [1] -0.2147117 0.9911750 -0.6356096 + [1] -1.5616130 0.6646470 -0.3192581 + [1] 0.3390397 -0.3406099 0.6870259 + [1] 0.3665034 0.1039416 0.1484616 + */ + val dataset = Seq( + OffsetInstance(0.2, 1.0, 
2.0, Vectors.dense(0.0, 5.0)), + OffsetInstance(0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)), + OffsetInstance(0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)), + OffsetInstance(0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)) ).toDF() + + val expected = Seq( + Vectors.dense(0, 0.5169222, -0.3344444), + Vectors.dense(0, 0.9419107, -0.6864404), + Vectors.dense(0, 0.1812436, -0.6568422), + Vectors.dense(0, -0.2869094, 0.785771), + Vectors.dense(0, 0.1055254, 0.2979113), + Vectors.dense(-0.05990345, 0.53188982, -0.32118415), + Vectors.dense(-0.2147117, 0.991175, -0.6356096), + Vectors.dense(-1.561613, 0.664647, -0.3192581), + Vectors.dense(0.3390397, -0.3406099, 0.6870259), + Vectors.dense(0.3665034, 0.1039416, 0.1484616)) + + import GeneralizedLinearRegression._ + + var idx = 0 + + for (fitIntercept <- Seq(false, true)) { + for (family <- Seq("gaussian", "binomial", "poisson", "gamma", "tweedie")) { + val trainer = new GeneralizedLinearRegression().setFamily(family) + .setFitIntercept(fitIntercept).setOffsetCol("offset") + .setWeightCol("weight").setLinkPredictionCol("linkPrediction") + if (family == "tweedie") trainer.setVariancePower(1.5) + val model = trainer.fit(dataset) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, s"Model mismatch: GLM with family = $family," + + s" and fitIntercept = $fitIntercept.") + + val familyLink = FamilyAndLink(trainer) + model.transform(dataset).select("features", "offset", "prediction", "linkPrediction") + .collect().foreach { + case Row(features: DenseVector, offset: Double, prediction1: Double, + linkPrediction1: Double) => + val eta = BLAS.dot(features, model.coefficients) + model.intercept + offset + val prediction2 = familyLink.fitted(eta) + val linkPrediction2 = eta + assert(prediction1 ~= prediction2 relTol 1E-5, "Prediction mismatch: GLM with " + + s"family = $family, and fitIntercept = $fitIntercept.") + assert(linkPrediction1 ~= linkPrediction2 relTol 1E-5, "Link Prediction mismatch: " + + s"GLM with family = $family, and fitIntercept = $fitIntercept.") + } + + idx += 1 + } + } + } + + test("glm summary: gaussian family with weight and offset") { /* - R code: + R code: - model <- glm(formula = "b ~ .", family="gaussian", data = df, weights = w) - summary(model) + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) + b <- c(17, 19, 23, 29) + w <- c(1, 2, 3, 4) + off <- c(2, 3, 1, 4) + df <- as.data.frame(cbind(A, b)) + */ + val dataset = Seq( + OffsetInstance(17.0, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse), + OffsetInstance(19.0, 2.0, 3.0, Vectors.dense(1.0, 7.0)), + OffsetInstance(23.0, 3.0, 1.0, Vectors.dense(2.0, 11.0)), + OffsetInstance(29.0, 4.0, 4.0, Vectors.dense(3.0, 13.0)) + ).toDF() + /* + R code: - Deviance Residuals: - 1 2 3 4 - 1.920 -1.358 -1.109 0.960 + model <- glm(formula = "b ~ .", family = "gaussian", data = df, + weights = w, offset = off) + summary(model) - Coefficients: - Estimate Std. Error t value Pr(>|t|) - (Intercept) 18.080 9.608 1.882 0.311 - V1 6.080 5.556 1.094 0.471 - V2 -0.600 1.960 -0.306 0.811 + Deviance Residuals: + 1 2 3 4 + 0.9600 -0.6788 -0.5543 0.4800 - (Dispersion parameter for gaussian family taken to be 7.68) + Coefficients: + Estimate Std. 
Error t value Pr(>|t|) + (Intercept) 5.5400 4.8040 1.153 0.455 + V1 -0.9600 2.7782 -0.346 0.788 + V2 1.7000 0.9798 1.735 0.333 - Null deviance: 202.00 on 3 degrees of freedom - Residual deviance: 7.68 on 1 degrees of freedom - AIC: 18.783 + (Dispersion parameter for gaussian family taken to be 1.92) - Number of Fisher Scoring iterations: 2 + Null deviance: 152.10 on 3 degrees of freedom + Residual deviance: 1.92 on 1 degrees of freedom + AIC: 13.238 - residuals(model, type="pearson") - 1 2 3 4 - 1.920000 -1.357645 -1.108513 0.960000 + Number of Fisher Scoring iterations: 2 - residuals(model, type="working") + residuals(model, type = "pearson") + 1 2 3 4 + 0.9600000 -0.6788225 -0.5542563 0.4800000 + residuals(model, type = "working") 1 2 3 4 - 1.92 -0.96 -0.64 0.48 - - residuals(model, type="response") + 0.96 -0.48 -0.32 0.24 + residuals(model, type = "response") 1 2 3 4 - 1.92 -0.96 -0.64 0.48 + 0.96 -0.48 -0.32 0.24 */ val trainer = new GeneralizedLinearRegression() - .setWeightCol("weight") + .setWeightCol("weight").setOffsetCol("offset") + + val model = trainer.fit(dataset) - val model = trainer.fit(datasetWithWeight) - - val coefficientsR = Vectors.dense(Array(6.080, -0.600)) - val interceptR = 18.080 - val devianceResidualsR = Array(1.920, -1.358, -1.109, 0.960) - val pearsonResidualsR = Array(1.920000, -1.357645, -1.108513, 0.960000) - val workingResidualsR = Array(1.92, -0.96, -0.64, 0.48) - val responseResidualsR = Array(1.92, -0.96, -0.64, 0.48) - val seCoefR = Array(5.556, 1.960, 9.608) - val tValsR = Array(1.094, -0.306, 1.882) - val pValsR = Array(0.471, 0.811, 0.311) - val dispersionR = 7.68 - val nullDevianceR = 202.00 - val residualDevianceR = 7.68 + val coefficientsR = Vectors.dense(Array(-0.96, 1.7)) + val interceptR = 5.54 + val devianceResidualsR = Array(0.96, -0.67882, -0.55426, 0.48) + val pearsonResidualsR = Array(0.96, -0.67882, -0.55426, 0.48) + val workingResidualsR = Array(0.96, -0.48, -0.32, 0.24) + val responseResidualsR = Array(0.96, -0.48, -0.32, 0.24) + val seCoefR = Array(2.7782, 0.9798, 4.804) + val tValsR = Array(-0.34555, 1.73506, 1.15321) + val pValsR = Array(0.78819, 0.33286, 0.45478) + val dispersionR = 1.92 + val nullDevianceR = 152.1 + val residualDevianceR = 1.92 val residualDegreeOfFreedomNullR = 3 val residualDegreeOfFreedomR = 1 - val aicR = 18.783 + val aicR = 13.23758 assert(model.hasSummary) val summary = model.summary @@ -912,7 +995,7 @@ class GeneralizedLinearRegressionSuite assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") - val summary2: GeneralizedLinearRegressionSummary = model.evaluate(datasetWithWeight) + val summary2: GeneralizedLinearRegressionSummary = model.evaluate(dataset) assert(summary.predictions.columns.toSet === summary2.predictions.columns.toSet) assert(summary.predictionCol === summary2.predictionCol) assert(summary.rank === summary2.rank) @@ -925,79 +1008,79 @@ class GeneralizedLinearRegressionSuite assert(summary.aic === summary2.aic) } - test("glm summary: binomial family with weight") { + test("glm summary: binomial family with weight and offset") { /* - R code: + R code: - A <- matrix(c(0, 1, 2, 3, 5, 2, 1, 3), 4, 2) - b <- c(1, 0.5, 1, 0) - w <- c(1, 2.0, 0.3, 4.7) - df <- as.data.frame(cbind(A, b)) + df <- as.data.frame(matrix(c( + 0.2, 1.0, 2.0, 0.0, 5.0, + 0.5, 2.1, 0.5, 1.0, 2.0, + 0.9, 0.4, 1.0, 2.0, 1.0, + 0.7, 0.7, 0.0, 3.0, 3.0), 4, 5, byrow = TRUE)) */ - val datasetWithWeight = Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)), 
- Instance(1.0, 0.3, Vectors.dense(2.0, 1.0)), - Instance(0.0, 4.7, Vectors.dense(3.0, 3.0)) + val dataset = Seq( + OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)), + OffsetInstance(0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)), + OffsetInstance(0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)), + OffsetInstance(0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)) ).toDF() - /* - R code: - - model <- glm(formula = "b ~ . -1", family="binomial", data = df, weights = w) - summary(model) - - Deviance Residuals: - 1 2 3 4 - 0.2404 0.1965 1.2824 -0.6916 + R code: - Coefficients: - Estimate Std. Error z value Pr(>|z|) - x1 -1.6901 1.2764 -1.324 0.185 - x2 0.7059 0.9449 0.747 0.455 + model <- glm(formula = "V1 ~ V4 + V5", family = "binomial", data = df, + weights = V2, offset = V3) + summary(model) - (Dispersion parameter for binomial family taken to be 1) + Deviance Residuals: + 1 2 3 4 + 0.002584 -0.003800 0.012478 -0.001796 - Null deviance: 8.3178 on 4 degrees of freedom - Residual deviance: 2.2193 on 2 degrees of freedom - AIC: 5.9915 + Coefficients: + Estimate Std. Error z value Pr(>|z|) + (Intercept) -0.2147 3.5687 -0.060 0.952 + V4 0.9912 1.2344 0.803 0.422 + V5 -0.6356 0.9669 -0.657 0.511 - Number of Fisher Scoring iterations: 5 + (Dispersion parameter for binomial family taken to be 1) - residuals(model, type="pearson") - 1 2 3 4 - 0.171217 0.197406 2.085864 -0.495332 + Null deviance: 2.17560881 on 3 degrees of freedom + Residual deviance: 0.00018005 on 1 degrees of freedom + AIC: 10.245 - residuals(model, type="working") - 1 2 3 4 - 1.029315 0.281881 15.502768 -1.052203 + Number of Fisher Scoring iterations: 4 - residuals(model, type="response") - 1 2 3 4 - 0.028480 0.069123 0.935495 -0.049613 + residuals(model, type = "pearson") + 1 2 3 4 + 0.002586113 -0.003799744 0.012372235 -0.001796892 + residuals(model, type = "working") + 1 2 3 4 + 0.006477857 -0.005244163 0.063541250 -0.004691064 + residuals(model, type = "response") + 1 2 3 4 + 0.0010324375 -0.0013110318 0.0060225522 -0.0009832738 */ val trainer = new GeneralizedLinearRegression() .setFamily("Binomial") .setWeightCol("weight") - .setFitIntercept(false) - - val model = trainer.fit(datasetWithWeight) - - val coefficientsR = Vectors.dense(Array(-1.690134, 0.705929)) - val interceptR = 0.0 - val devianceResidualsR = Array(0.2404, 0.1965, 1.2824, -0.6916) - val pearsonResidualsR = Array(0.171217, 0.197406, 2.085864, -0.495332) - val workingResidualsR = Array(1.029315, 0.281881, 15.502768, -1.052203) - val responseResidualsR = Array(0.02848, 0.069123, 0.935495, -0.049613) - val seCoefR = Array(1.276417, 0.944934) - val tValsR = Array(-1.324124, 0.747068) - val pValsR = Array(0.185462, 0.455023) - val dispersionR = 1.0 - val nullDevianceR = 8.3178 - val residualDevianceR = 2.2193 - val residualDegreeOfFreedomNullR = 4 - val residualDegreeOfFreedomR = 2 - val aicR = 5.991537 + .setOffsetCol("offset") + + val model = trainer.fit(dataset) + + val coefficientsR = Vectors.dense(Array(0.99117, -0.63561)) + val interceptR = -0.21471 + val devianceResidualsR = Array(0.00258, -0.0038, 0.01248, -0.0018) + val pearsonResidualsR = Array(0.00259, -0.0038, 0.01237, -0.0018) + val workingResidualsR = Array(0.00648, -0.00524, 0.06354, -0.00469) + val responseResidualsR = Array(0.00103, -0.00131, 0.00602, -0.00098) + val seCoefR = Array(1.23439, 0.9669, 3.56866) + val tValsR = Array(0.80297, -0.65737, -0.06017) + val pValsR = Array(0.42199, 0.51094, 0.95202) + val dispersionR = 1 + val nullDevianceR = 2.17561 + val residualDevianceR = 0.00018 + val 
residualDegreeOfFreedomNullR = 3 + val residualDegreeOfFreedomR = 1 + val aicR = 10.24453 val summary = model.summary val devianceResiduals = summary.residuals() @@ -1040,81 +1123,79 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } - test("glm summary: poisson family with weight") { + test("glm summary: poisson family with weight and offset") { /* - R code: + R code: - A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) - b <- c(2, 8, 3, 9) - w <- c(1, 2, 3, 4) - df <- as.data.frame(cbind(A, b)) + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) + b <- c(2, 8, 3, 9) + w <- c(1, 2, 3, 4) + off <- c(2, 3, 1, 4) + df <- as.data.frame(cbind(A, b)) */ - val datasetWithWeight = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), - Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), - Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)) + val dataset = Seq( + OffsetInstance(2.0, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse), + OffsetInstance(8.0, 2.0, 3.0, Vectors.dense(1.0, 7.0)), + OffsetInstance(3.0, 3.0, 1.0, Vectors.dense(2.0, 11.0)), + OffsetInstance(9.0, 4.0, 4.0, Vectors.dense(3.0, 13.0)) ).toDF() /* - R code: - - model <- glm(formula = "b ~ .", family="poisson", data = df, weights = w) - summary(model) - - Deviance Residuals: - 1 2 3 4 - -0.28952 0.11048 0.14839 -0.07268 - - Coefficients: - Estimate Std. Error z value Pr(>|z|) - (Intercept) 6.2999 1.6086 3.916 8.99e-05 *** - V1 3.3241 1.0184 3.264 0.00110 ** - V2 -1.0818 0.3522 -3.071 0.00213 ** - --- - Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 - - (Dispersion parameter for poisson family taken to be 1) - - Null deviance: 15.38066 on 3 degrees of freedom - Residual deviance: 0.12333 on 1 degrees of freedom - AIC: 41.803 - - Number of Fisher Scoring iterations: 3 + R code: - residuals(model, type="pearson") - 1 2 3 4 - -0.28043145 0.11099310 0.14963714 -0.07253611 + model <- glm(formula = "b ~ .", family = "poisson", data = df, + weights = w, offset = off) + summary(model) - residuals(model, type="working") - 1 2 3 4 - -0.17960679 0.02813593 0.05113852 -0.01201650 + Deviance Residuals: + 1 2 3 4 + -2.0480 1.2315 1.8293 -0.7107 - residuals(model, type="response") - 1 2 3 4 - -0.4378554 0.2189277 0.1459518 -0.1094638 + Coefficients: + Estimate Std. 
Error z value Pr(>|z|) + (Intercept) -4.5678 1.9625 -2.328 0.0199 + V1 -2.8784 1.1683 -2.464 0.0137 + V2 0.8859 0.4170 2.124 0.0336 + + (Dispersion parameter for poisson family taken to be 1) + + Null deviance: 22.5585 on 3 degrees of freedom + Residual deviance: 9.5622 on 1 degrees of freedom + AIC: 51.242 + + Number of Fisher Scoring iterations: 5 + + residuals(model, type = "pearson") + 1 2 3 4 + -1.7480418 1.3037611 2.0750099 -0.6972966 + residuals(model, type = "working") + 1 2 3 4 + -0.6891489 0.3833588 0.9710682 -0.1096590 + residuals(model, type = "response") + 1 2 3 4 + -4.433948 2.216974 1.477983 -1.108487 */ val trainer = new GeneralizedLinearRegression() .setFamily("Poisson") .setWeightCol("weight") - .setFitIntercept(true) - - val model = trainer.fit(datasetWithWeight) - - val coefficientsR = Vectors.dense(Array(3.3241, -1.0818)) - val interceptR = 6.2999 - val devianceResidualsR = Array(-0.28952, 0.11048, 0.14839, -0.07268) - val pearsonResidualsR = Array(-0.28043145, 0.11099310, 0.14963714, -0.07253611) - val workingResidualsR = Array(-0.17960679, 0.02813593, 0.05113852, -0.01201650) - val responseResidualsR = Array(-0.4378554, 0.2189277, 0.1459518, -0.1094638) - val seCoefR = Array(1.0184, 0.3522, 1.6086) - val tValsR = Array(3.264, -3.071, 3.916) - val pValsR = Array(0.00110, 0.00213, 0.00009) - val dispersionR = 1.0 - val nullDevianceR = 15.38066 - val residualDevianceR = 0.12333 + .setOffsetCol("offset") + + val model = trainer.fit(dataset) + + val coefficientsR = Vectors.dense(Array(-2.87843, 0.88589)) + val interceptR = -4.56784 + val devianceResidualsR = Array(-2.04796, 1.23149, 1.82933, -0.71066) + val pearsonResidualsR = Array(-1.74804, 1.30376, 2.07501, -0.6973) + val workingResidualsR = Array(-0.68915, 0.38336, 0.97107, -0.10966) + val responseResidualsR = Array(-4.43395, 2.21697, 1.47798, -1.10849) + val seCoefR = Array(1.16826, 0.41703, 1.96249) + val tValsR = Array(-2.46387, 2.12428, -2.32757) + val pValsR = Array(0.01374, 0.03365, 0.01993) + val dispersionR = 1 + val nullDevianceR = 22.55853 + val residualDevianceR = 9.5622 val residualDegreeOfFreedomNullR = 3 val residualDegreeOfFreedomR = 1 - val aicR = 41.803 + val aicR = 51.24218 val summary = model.summary val devianceResiduals = summary.residuals() @@ -1157,78 +1238,79 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } - test("glm summary: gamma family with weight") { + test("glm summary: gamma family with weight and offset") { /* - R code: + R code: - A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) - b <- c(2, 8, 3, 9) - w <- c(1, 2, 3, 4) - df <- as.data.frame(cbind(A, b)) + A <- matrix(c(0, 5, 1, 2, 2, 1, 3, 3), 4, 2, byrow = TRUE) + b <- c(1, 2, 1, 2) + w <- c(1, 2, 3, 4) + off <- c(0, 0.5, 1, 0) + df <- as.data.frame(cbind(A, b)) */ - val datasetWithWeight = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), - Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), - Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), - Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)) + val dataset = Seq( + OffsetInstance(1.0, 1.0, 0.0, Vectors.dense(0.0, 5.0)), + OffsetInstance(2.0, 2.0, 0.5, Vectors.dense(1.0, 2.0)), + OffsetInstance(1.0, 3.0, 1.0, Vectors.dense(2.0, 1.0)), + OffsetInstance(2.0, 4.0, 0.0, Vectors.dense(3.0, 3.0)) ).toDF() /* - R code: - - model <- glm(formula = "b ~ .", family="Gamma", data = df, weights = w) - summary(model) + R code: - Deviance Residuals: - 1 2 3 4 - -0.26343 0.05761 0.12818 -0.03484 + model <- glm(formula = "b ~ .", family = "Gamma", data = df, + weights = w, 
offset = off) + summary(model) - Coefficients: - Estimate Std. Error t value Pr(>|t|) - (Intercept) -0.81511 0.23449 -3.476 0.178 - V1 -0.72730 0.16137 -4.507 0.139 - V2 0.23894 0.05481 4.359 0.144 + Deviance Residuals: + 1 2 3 4 + -0.17095 0.19867 -0.23604 0.03241 - (Dispersion parameter for Gamma family taken to be 0.07986091) + Coefficients: + Estimate Std. Error t value Pr(>|t|) + (Intercept) -0.56474 0.23866 -2.366 0.255 + V1 0.07695 0.06931 1.110 0.467 + V2 0.28068 0.07320 3.835 0.162 - Null deviance: 2.937462 on 3 degrees of freedom - Residual deviance: 0.090358 on 1 degrees of freedom - AIC: 23.202 + (Dispersion parameter for Gamma family taken to be 0.1212174) - Number of Fisher Scoring iterations: 4 + Null deviance: 2.02568 on 3 degrees of freedom + Residual deviance: 0.12546 on 1 degrees of freedom + AIC: 0.93388 - residuals(model, type="pearson") - 1 2 3 4 - -0.24082508 0.05839241 0.13135766 -0.03463621 + Number of Fisher Scoring iterations: 4 - residuals(model, type="working") + residuals(model, type = "pearson") + 1 2 3 4 + -0.16134949 0.20807694 -0.22544551 0.03258777 + residuals(model, type = "working") 1 2 3 4 - 0.091414181 -0.005374314 -0.027196998 0.001890910 - - residuals(model, type="response") - 1 2 3 4 - -0.6344390 0.3172195 0.2114797 -0.1586097 + 0.135315831 -0.084390309 0.113219135 -0.008279688 + residuals(model, type = "response") + 1 2 3 4 + -0.1923918 0.2565224 -0.1496381 0.0320653 */ val trainer = new GeneralizedLinearRegression() .setFamily("Gamma") .setWeightCol("weight") + .setOffsetCol("offset") + + val model = trainer.fit(dataset) - val model = trainer.fit(datasetWithWeight) - - val coefficientsR = Vectors.dense(Array(-0.72730, 0.23894)) - val interceptR = -0.81511 - val devianceResidualsR = Array(-0.26343, 0.05761, 0.12818, -0.03484) - val pearsonResidualsR = Array(-0.24082508, 0.05839241, 0.13135766, -0.03463621) - val workingResidualsR = Array(0.091414181, -0.005374314, -0.027196998, 0.001890910) - val responseResidualsR = Array(-0.6344390, 0.3172195, 0.2114797, -0.1586097) - val seCoefR = Array(0.16137, 0.05481, 0.23449) - val tValsR = Array(-4.507, 4.359, -3.476) - val pValsR = Array(0.139, 0.144, 0.178) - val dispersionR = 0.07986091 - val nullDevianceR = 2.937462 - val residualDevianceR = 0.090358 + val coefficientsR = Vectors.dense(Array(0.07695, 0.28068)) + val interceptR = -0.56474 + val devianceResidualsR = Array(-0.17095, 0.19867, -0.23604, 0.03241) + val pearsonResidualsR = Array(-0.16135, 0.20808, -0.22545, 0.03259) + val workingResidualsR = Array(0.13532, -0.08439, 0.11322, -0.00828) + val responseResidualsR = Array(-0.19239, 0.25652, -0.14964, 0.03207) + val seCoefR = Array(0.06931, 0.0732, 0.23866) + val tValsR = Array(1.11031, 3.83453, -2.3663) + val pValsR = Array(0.46675, 0.16241, 0.25454) + val dispersionR = 0.12122 + val nullDevianceR = 2.02568 + val residualDevianceR = 0.12546 val residualDegreeOfFreedomNullR = 3 val residualDegreeOfFreedomR = 1 - val aicR = 23.202 + val aicR = 0.93388 val summary = model.summary val devianceResiduals = summary.residuals() @@ -1271,77 +1353,81 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } - test("glm summary: tweedie family with weight") { + test("glm summary: tweedie family with weight and offset") { /* R code: - library(statmod) df <- as.data.frame(matrix(c( - 1.0, 1.0, 0.0, 5.0, - 0.5, 2.0, 1.0, 2.0, - 1.0, 3.0, 2.0, 1.0, - 0.0, 4.0, 3.0, 3.0), 4, 4, byrow = TRUE)) + 1.0, 1.0, 1.0, 0.0, 5.0, + 0.5, 2.0, 3.0, 1.0, 2.0, + 1.0, 3.0, 2.0, 2.0, 1.0, + 0.0, 4.0, 0.0, 3.0, 
3.0), 4, 5, byrow = TRUE)) + */ + val dataset = Seq( + OffsetInstance(1.0, 1.0, 1.0, Vectors.dense(0.0, 5.0)), + OffsetInstance(0.5, 2.0, 3.0, Vectors.dense(1.0, 2.0)), + OffsetInstance(1.0, 3.0, 2.0, Vectors.dense(2.0, 1.0)), + OffsetInstance(0.0, 4.0, 0.0, Vectors.dense(3.0, 3.0)) + ).toDF() + /* + R code: - model <- glm(V1 ~ -1 + V3 + V4, data = df, weights = V2, - family = tweedie(var.power = 1.6, link.power = 0)) + library(statmod) + model <- glm(V1 ~ V4 + V5, data = df, weights = V2, offset = V3, + family = tweedie(var.power = 1.6, link.power = 0.0)) summary(model) Deviance Residuals: 1 2 3 4 - 0.6210 -0.0515 1.6935 -3.2539 + 0.8917 -2.1396 1.2252 -1.7946 Coefficients: - Estimate Std. Error t value Pr(>|t|) - V3 -0.4087 0.5205 -0.785 0.515 - V4 -0.1212 0.4082 -0.297 0.794 + Estimate Std. Error t value Pr(>|t|) + (Intercept) -0.03047 3.65000 -0.008 0.995 + V4 -1.14577 1.41674 -0.809 0.567 + V5 -0.36585 0.97065 -0.377 0.771 - (Dispersion parameter for Tweedie family taken to be 3.830036) + (Dispersion parameter for Tweedie family taken to be 6.334961) - Null deviance: 20.702 on 4 degrees of freedom - Residual deviance: 13.844 on 2 degrees of freedom + Null deviance: 12.784 on 3 degrees of freedom + Residual deviance: 10.095 on 1 degrees of freedom AIC: NA - Number of Fisher Scoring iterations: 11 - - residuals(model, type="pearson") - 1 2 3 4 - 0.7383616 -0.0509458 2.2348337 -1.4552090 - residuals(model, type="working") - 1 2 3 4 - 0.83354150 -0.04103552 1.55676369 -1.00000000 - residuals(model, type="response") - 1 2 3 4 - 0.45460738 -0.02139574 0.60888055 -0.20392801 + Number of Fisher Scoring iterations: 18 + + residuals(model, type = "pearson") + 1 2 3 4 + 1.1472554 -1.4642569 1.4935199 -0.8025842 + residuals(model, type = "working") + 1 2 3 4 + 1.3624928 -0.8322375 0.9894580 -1.0000000 + residuals(model, type = "response") + 1 2 3 4 + 0.57671828 -2.48040354 0.49735052 -0.01040646 */ - val datasetWithWeight = Seq( - Instance(1.0, 1.0, Vectors.dense(0.0, 5.0)), - Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)), - Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)), - Instance(0.0, 4.0, Vectors.dense(3.0, 3.0)) - ).toDF() - val trainer = new GeneralizedLinearRegression() .setFamily("tweedie") .setVariancePower(1.6) .setLinkPower(0.0) .setWeightCol("weight") - .setFitIntercept(false) - - val model = trainer.fit(datasetWithWeight) - val coefficientsR = Vectors.dense(Array(-0.408746, -0.12125)) - val interceptR = 0.0 - val devianceResidualsR = Array(0.621047, -0.051515, 1.693473, -3.253946) - val pearsonResidualsR = Array(0.738362, -0.050946, 2.234834, -1.455209) - val workingResidualsR = Array(0.833541, -0.041036, 1.556764, -1.0) - val responseResidualsR = Array(0.454607, -0.021396, 0.608881, -0.203928) - val seCoefR = Array(0.520519, 0.408215) - val tValsR = Array(-0.785267, -0.297024) - val pValsR = Array(0.514549, 0.794457) - val dispersionR = 3.830036 - val nullDevianceR = 20.702 - val residualDevianceR = 13.844 - val residualDegreeOfFreedomNullR = 4 - val residualDegreeOfFreedomR = 2 + .setOffsetCol("offset") + + val model = trainer.fit(dataset) + + val coefficientsR = Vectors.dense(Array(-1.14577, -0.36585)) + val interceptR = -0.03047 + val devianceResidualsR = Array(0.89171, -2.13961, 1.2252, -1.79463) + val pearsonResidualsR = Array(1.14726, -1.46426, 1.49352, -0.80258) + val workingResidualsR = Array(1.36249, -0.83224, 0.98946, -1) + val responseResidualsR = Array(0.57672, -2.4804, 0.49735, -0.01041) + val seCoefR = Array(1.41674, 0.97065, 3.65) + val tValsR = Array(-0.80873, -0.37691, 
-0.00835) + val pValsR = Array(0.56707, 0.77053, 0.99468) + val dispersionR = 6.33496 + val nullDevianceR = 12.78358 + val residualDevianceR = 10.09488 + val residualDegreeOfFreedomNullR = 3 + val residualDegreeOfFreedomR = 1 val summary = model.summary From 3c2fc19d478256f8dc0ae7219fdd188030218c07 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Fri, 30 Jun 2017 20:30:26 +0800 Subject: [PATCH 055/779] [SPARK-18294][CORE] Implement commit protocol to support `mapred` package's committer ## What changes were proposed in this pull request? This PR makes the following changes: - Implement a new commit protocol `HadoopMapRedCommitProtocol` which support the old `mapred` package's committer; - Refactor SparkHadoopWriter and SparkHadoopMapReduceWriter, now they are combined together, thus we can support write through both mapred and mapreduce API by the new SparkHadoopWriter, a lot of duplicated codes are removed. After this change, it should be pretty easy for us to support the committer from both the new and the old hadoop API at high level. ## How was this patch tested? No major behavior change, passed the existing test cases. Author: Xingbo Jiang Closes #18438 from jiangxb1987/SparkHadoopWriter. --- .../io/HadoopMapRedCommitProtocol.scala | 36 ++ .../internal/io/HadoopWriteConfigUtil.scala | 79 ++++ .../io/SparkHadoopMapReduceWriter.scala | 181 -------- .../spark/internal/io/SparkHadoopWriter.scala | 393 ++++++++++++++---- .../apache/spark/rdd/PairRDDFunctions.scala | 72 +--- .../spark/rdd/PairRDDFunctionsSuite.scala | 2 +- .../OutputCommitCoordinatorSuite.scala | 35 +- 7 files changed, 461 insertions(+), 337 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala create mode 100644 core/src/main/scala/org/apache/spark/internal/io/HadoopWriteConfigUtil.scala delete mode 100644 core/src/main/scala/org/apache/spark/internal/io/SparkHadoopMapReduceWriter.scala diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala new file mode 100644 index 0000000000000..ddbd624b380d4 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapred._ +import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} + +/** + * An [[FileCommitProtocol]] implementation backed by an underlying Hadoop OutputCommitter + * (from the old mapred API). + * + * Unlike Hadoop's OutputCommitter, this implementation is serializable. 
+ */ +class HadoopMapRedCommitProtocol(jobId: String, path: String) + extends HadoopMapReduceCommitProtocol(jobId, path) { + + override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { + val config = context.getConfiguration.asInstanceOf[JobConf] + config.getOutputCommitter + } +} diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopWriteConfigUtil.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopWriteConfigUtil.scala new file mode 100644 index 0000000000000..9b987e0e1bb67 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopWriteConfigUtil.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import scala.reflect.ClassTag + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.SparkConf + +/** + * Interface for create output format/committer/writer used during saving an RDD using a Hadoop + * OutputFormat (both from the old mapred API and the new mapreduce API) + * + * Notes: + * 1. Implementations should throw [[IllegalArgumentException]] when wrong hadoop API is + * referenced; + * 2. Implementations must be serializable, as the instance instantiated on the driver + * will be used for tasks on executors; + * 3. Implementations should have a constructor with exactly one argument: + * (conf: SerializableConfiguration) or (conf: SerializableJobConf). 
+ */ +abstract class HadoopWriteConfigUtil[K, V: ClassTag] extends Serializable { + + // -------------------------------------------------------------------------- + // Create JobContext/TaskAttemptContext + // -------------------------------------------------------------------------- + + def createJobContext(jobTrackerId: String, jobId: Int): JobContext + + def createTaskAttemptContext( + jobTrackerId: String, + jobId: Int, + splitId: Int, + taskAttemptId: Int): TaskAttemptContext + + // -------------------------------------------------------------------------- + // Create committer + // -------------------------------------------------------------------------- + + def createCommitter(jobId: Int): HadoopMapReduceCommitProtocol + + // -------------------------------------------------------------------------- + // Create writer + // -------------------------------------------------------------------------- + + def initWriter(taskContext: TaskAttemptContext, splitId: Int): Unit + + def write(pair: (K, V)): Unit + + def closeWriter(taskContext: TaskAttemptContext): Unit + + // -------------------------------------------------------------------------- + // Create OutputFormat + // -------------------------------------------------------------------------- + + def initOutputFormat(jobContext: JobContext): Unit + + // -------------------------------------------------------------------------- + // Verify hadoop config + // -------------------------------------------------------------------------- + + def assertConf(jobContext: JobContext, conf: SparkConf): Unit +} diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopMapReduceWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopMapReduceWriter.scala deleted file mode 100644 index 376ff9bb19f74..0000000000000 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopMapReduceWriter.scala +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.internal.io - -import java.text.SimpleDateFormat -import java.util.{Date, Locale} - -import scala.reflect.ClassTag -import scala.util.DynamicVariable - -import org.apache.hadoop.conf.{Configurable, Configuration} -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapred.{JobConf, JobID} -import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl - -import org.apache.spark.{SparkConf, SparkException, TaskContext} -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.executor.OutputMetrics -import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage -import org.apache.spark.rdd.RDD -import org.apache.spark.util.{SerializableConfiguration, Utils} - -/** - * A helper object that saves an RDD using a Hadoop OutputFormat - * (from the newer mapreduce API, not the old mapred API). - */ -private[spark] -object SparkHadoopMapReduceWriter extends Logging { - - /** - * Basic work flow of this command is: - * 1. Driver side setup, prepare the data source and hadoop configuration for the write job to - * be issued. - * 2. Issues a write job consists of one or more executor side tasks, each of which writes all - * rows within an RDD partition. - * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any - * exception is thrown during task commitment, also aborts that task. - * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is - * thrown during job commitment, also aborts the job. - */ - def write[K, V: ClassTag]( - rdd: RDD[(K, V)], - hadoopConf: Configuration): Unit = { - // Extract context and configuration from RDD. - val sparkContext = rdd.context - val stageId = rdd.id - val sparkConf = rdd.conf - val conf = new SerializableConfiguration(hadoopConf) - - // Set up a job. - val jobTrackerId = SparkHadoopWriterUtils.createJobTrackerID(new Date()) - val jobAttemptId = new TaskAttemptID(jobTrackerId, stageId, TaskType.MAP, 0, 0) - val jobContext = new TaskAttemptContextImpl(conf.value, jobAttemptId) - val format = jobContext.getOutputFormatClass - - if (SparkHadoopWriterUtils.isOutputSpecValidationEnabled(sparkConf)) { - // FileOutputFormat ignores the filesystem parameter - val jobFormat = format.newInstance - jobFormat.checkOutputSpecs(jobContext) - } - - val committer = FileCommitProtocol.instantiate( - className = classOf[HadoopMapReduceCommitProtocol].getName, - jobId = stageId.toString, - outputPath = conf.value.get("mapreduce.output.fileoutputformat.outputdir"), - isAppend = false).asInstanceOf[HadoopMapReduceCommitProtocol] - committer.setupJob(jobContext) - - // Try to write all RDD partitions as a Hadoop OutputFormat. 
- try { - val ret = sparkContext.runJob(rdd, (context: TaskContext, iter: Iterator[(K, V)]) => { - executeTask( - context = context, - jobTrackerId = jobTrackerId, - sparkStageId = context.stageId, - sparkPartitionId = context.partitionId, - sparkAttemptNumber = context.attemptNumber, - committer = committer, - hadoopConf = conf.value, - outputFormat = format.asInstanceOf[Class[OutputFormat[K, V]]], - iterator = iter) - }) - - committer.commitJob(jobContext, ret) - logInfo(s"Job ${jobContext.getJobID} committed.") - } catch { - case cause: Throwable => - logError(s"Aborting job ${jobContext.getJobID}.", cause) - committer.abortJob(jobContext) - throw new SparkException("Job aborted.", cause) - } - } - - /** Write an RDD partition out in a single Spark task. */ - private def executeTask[K, V: ClassTag]( - context: TaskContext, - jobTrackerId: String, - sparkStageId: Int, - sparkPartitionId: Int, - sparkAttemptNumber: Int, - committer: FileCommitProtocol, - hadoopConf: Configuration, - outputFormat: Class[_ <: OutputFormat[K, V]], - iterator: Iterator[(K, V)]): TaskCommitMessage = { - // Set up a task. - val attemptId = new TaskAttemptID(jobTrackerId, sparkStageId, TaskType.REDUCE, - sparkPartitionId, sparkAttemptNumber) - val taskContext = new TaskAttemptContextImpl(hadoopConf, attemptId) - committer.setupTask(taskContext) - - val (outputMetrics, callback) = SparkHadoopWriterUtils.initHadoopOutputMetrics(context) - - // Initiate the writer. - val taskFormat = outputFormat.newInstance() - // If OutputFormat is Configurable, we should set conf to it. - taskFormat match { - case c: Configurable => c.setConf(hadoopConf) - case _ => () - } - var writer = taskFormat.getRecordWriter(taskContext) - .asInstanceOf[RecordWriter[K, V]] - require(writer != null, "Unable to obtain RecordWriter") - var recordsWritten = 0L - - // Write all rows in RDD partition. - try { - val ret = Utils.tryWithSafeFinallyAndFailureCallbacks { - // Write rows out, release resource and commit the task. - while (iterator.hasNext) { - val pair = iterator.next() - writer.write(pair._1, pair._2) - - // Update bytes written metric every few records - SparkHadoopWriterUtils.maybeUpdateOutputMetrics(outputMetrics, callback, recordsWritten) - recordsWritten += 1 - } - if (writer != null) { - writer.close(taskContext) - writer = null - } - committer.commitTask(taskContext) - }(catchBlock = { - // If there is an error, release resource and then abort the task. 
- try { - if (writer != null) { - writer.close(taskContext) - writer = null - } - } finally { - committer.abortTask(taskContext) - logError(s"Task ${taskContext.getTaskAttemptID} aborted.") - } - }) - - outputMetrics.setBytesWritten(callback()) - outputMetrics.setRecordsWritten(recordsWritten) - - ret - } catch { - case t: Throwable => - throw new SparkException("Task failed while writing rows", t) - } - } -} diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index acc9c38571007..7d846f9354df6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -17,143 +17,374 @@ package org.apache.spark.internal.io -import java.io.IOException -import java.text.{NumberFormat, SimpleDateFormat} +import java.text.NumberFormat import java.util.{Date, Locale} +import scala.reflect.ClassTag + +import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.mapred._ -import org.apache.hadoop.mapreduce.TaskType +import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, +OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, +TaskAttemptContext => NewTaskAttemptContext, TaskAttemptID => NewTaskAttemptID, TaskType} +import org.apache.hadoop.mapreduce.task.{TaskAttemptContextImpl => NewTaskAttemptContextImpl} -import org.apache.spark.SerializableWritable +import org.apache.spark.{SerializableWritable, SparkConf, SparkException, TaskContext} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.mapred.SparkHadoopMapRedUtil -import org.apache.spark.rdd.HadoopRDD -import org.apache.spark.util.SerializableJobConf +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage +import org.apache.spark.rdd.{HadoopRDD, RDD} +import org.apache.spark.util.{SerializableConfiguration, SerializableJobConf, Utils} /** - * Internal helper class that saves an RDD using a Hadoop OutputFormat. - * - * Saves the RDD using a JobConf, which should contain an output key class, an output value class, - * a filename to write to, etc, exactly like in a Hadoop MapReduce job. + * A helper object that saves an RDD using a Hadoop OutputFormat. + */ +private[spark] +object SparkHadoopWriter extends Logging { + import SparkHadoopWriterUtils._ + + /** + * Basic work flow of this command is: + * 1. Driver side setup, prepare the data source and hadoop configuration for the write job to + * be issued. + * 2. Issues a write job consists of one or more executor side tasks, each of which writes all + * rows within an RDD partition. + * 3. If no exception is thrown in a task, commits that task, otherwise aborts that task; If any + * exception is thrown during task commitment, also aborts that task. + * 4. If all tasks are committed, commit the job, otherwise aborts the job; If any exception is + * thrown during job commitment, also aborts the job. + */ + def write[K, V: ClassTag]( + rdd: RDD[(K, V)], + config: HadoopWriteConfigUtil[K, V]): Unit = { + // Extract context and configuration from RDD. + val sparkContext = rdd.context + val stageId = rdd.id + + // Set up a job. 
+ val jobTrackerId = createJobTrackerID(new Date()) + val jobContext = config.createJobContext(jobTrackerId, stageId) + config.initOutputFormat(jobContext) + + // Assert the output format/key/value class is set in JobConf. + config.assertConf(jobContext, rdd.conf) + + val committer = config.createCommitter(stageId) + committer.setupJob(jobContext) + + // Try to write all RDD partitions as a Hadoop OutputFormat. + try { + val ret = sparkContext.runJob(rdd, (context: TaskContext, iter: Iterator[(K, V)]) => { + executeTask( + context = context, + config = config, + jobTrackerId = jobTrackerId, + sparkStageId = context.stageId, + sparkPartitionId = context.partitionId, + sparkAttemptNumber = context.attemptNumber, + committer = committer, + iterator = iter) + }) + + committer.commitJob(jobContext, ret) + logInfo(s"Job ${jobContext.getJobID} committed.") + } catch { + case cause: Throwable => + logError(s"Aborting job ${jobContext.getJobID}.", cause) + committer.abortJob(jobContext) + throw new SparkException("Job aborted.", cause) + } + } + + /** Write a RDD partition out in a single Spark task. */ + private def executeTask[K, V: ClassTag]( + context: TaskContext, + config: HadoopWriteConfigUtil[K, V], + jobTrackerId: String, + sparkStageId: Int, + sparkPartitionId: Int, + sparkAttemptNumber: Int, + committer: FileCommitProtocol, + iterator: Iterator[(K, V)]): TaskCommitMessage = { + // Set up a task. + val taskContext = config.createTaskAttemptContext( + jobTrackerId, sparkStageId, sparkPartitionId, sparkAttemptNumber) + committer.setupTask(taskContext) + + val (outputMetrics, callback) = initHadoopOutputMetrics(context) + + // Initiate the writer. + config.initWriter(taskContext, sparkPartitionId) + var recordsWritten = 0L + + // Write all rows in RDD partition. + try { + val ret = Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { + val pair = iterator.next() + config.write(pair) + + // Update bytes written metric every few records + maybeUpdateOutputMetrics(outputMetrics, callback, recordsWritten) + recordsWritten += 1 + } + + config.closeWriter(taskContext) + committer.commitTask(taskContext) + }(catchBlock = { + // If there is an error, release resource and then abort the task. + try { + config.closeWriter(taskContext) + } finally { + committer.abortTask(taskContext) + logError(s"Task ${taskContext.getTaskAttemptID} aborted.") + } + }) + + outputMetrics.setBytesWritten(callback()) + outputMetrics.setRecordsWritten(recordsWritten) + + ret + } catch { + case t: Throwable => + throw new SparkException("Task failed while writing rows", t) + } + } +} + +/** + * A helper class that reads JobConf from older mapred API, creates output Format/Committer/Writer. 
*/ private[spark] -class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { +class HadoopMapRedWriteConfigUtil[K, V: ClassTag](conf: SerializableJobConf) + extends HadoopWriteConfigUtil[K, V] with Logging { - private val now = new Date() - private val conf = new SerializableJobConf(jobConf) + private var outputFormat: Class[_ <: OutputFormat[K, V]] = null + private var writer: RecordWriter[K, V] = null - private var jobID = 0 - private var splitID = 0 - private var attemptID = 0 - private var jID: SerializableWritable[JobID] = null - private var taID: SerializableWritable[TaskAttemptID] = null + private def getConf: JobConf = conf.value - @transient private var writer: RecordWriter[AnyRef, AnyRef] = null - @transient private var format: OutputFormat[AnyRef, AnyRef] = null - @transient private var committer: OutputCommitter = null - @transient private var jobContext: JobContext = null - @transient private var taskContext: TaskAttemptContext = null + // -------------------------------------------------------------------------- + // Create JobContext/TaskAttemptContext + // -------------------------------------------------------------------------- - def preSetup() { - setIDs(0, 0, 0) - HadoopRDD.addLocalConfiguration("", 0, 0, 0, conf.value) + override def createJobContext(jobTrackerId: String, jobId: Int): NewJobContext = { + val jobAttemptId = new SerializableWritable(new JobID(jobTrackerId, jobId)) + new JobContextImpl(getConf, jobAttemptId.value) + } - val jCtxt = getJobContext() - getOutputCommitter().setupJob(jCtxt) + override def createTaskAttemptContext( + jobTrackerId: String, + jobId: Int, + splitId: Int, + taskAttemptId: Int): NewTaskAttemptContext = { + // Update JobConf. + HadoopRDD.addLocalConfiguration(jobTrackerId, jobId, splitId, taskAttemptId, conf.value) + // Create taskContext. + val attemptId = new TaskAttemptID(jobTrackerId, jobId, TaskType.MAP, splitId, taskAttemptId) + new TaskAttemptContextImpl(getConf, attemptId) } + // -------------------------------------------------------------------------- + // Create committer + // -------------------------------------------------------------------------- - def setup(jobid: Int, splitid: Int, attemptid: Int) { - setIDs(jobid, splitid, attemptid) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(now), - jobid, splitID, attemptID, conf.value) + override def createCommitter(jobId: Int): HadoopMapReduceCommitProtocol = { + // Update JobConf. + HadoopRDD.addLocalConfiguration("", 0, 0, 0, getConf) + // Create commit protocol. 
+ FileCommitProtocol.instantiate( + className = classOf[HadoopMapRedCommitProtocol].getName, + jobId = jobId.toString, + outputPath = getConf.get("mapred.output.dir"), + isAppend = false).asInstanceOf[HadoopMapReduceCommitProtocol] } - def open() { + // -------------------------------------------------------------------------- + // Create writer + // -------------------------------------------------------------------------- + + override def initWriter(taskContext: NewTaskAttemptContext, splitId: Int): Unit = { val numfmt = NumberFormat.getInstance(Locale.US) numfmt.setMinimumIntegerDigits(5) numfmt.setGroupingUsed(false) - val outputName = "part-" + numfmt.format(splitID) - val path = FileOutputFormat.getOutputPath(conf.value) + val outputName = "part-" + numfmt.format(splitId) + val path = FileOutputFormat.getOutputPath(getConf) val fs: FileSystem = { if (path != null) { - path.getFileSystem(conf.value) + path.getFileSystem(getConf) } else { - FileSystem.get(conf.value) + FileSystem.get(getConf) } } - getOutputCommitter().setupTask(getTaskContext()) - writer = getOutputFormat().getRecordWriter(fs, conf.value, outputName, Reporter.NULL) + writer = getConf.getOutputFormat + .getRecordWriter(fs, getConf, outputName, Reporter.NULL) + .asInstanceOf[RecordWriter[K, V]] + + require(writer != null, "Unable to obtain RecordWriter") } - def write(key: AnyRef, value: AnyRef) { + override def write(pair: (K, V)): Unit = { + require(writer != null, "Must call createWriter before write.") + writer.write(pair._1, pair._2) + } + + override def closeWriter(taskContext: NewTaskAttemptContext): Unit = { if (writer != null) { - writer.write(key, value) - } else { - throw new IOException("Writer is null, open() has not been called") + writer.close(Reporter.NULL) } } - def close() { - writer.close(Reporter.NULL) - } + // -------------------------------------------------------------------------- + // Create OutputFormat + // -------------------------------------------------------------------------- - def commit() { - SparkHadoopMapRedUtil.commitTask(getOutputCommitter(), getTaskContext(), jobID, splitID) + override def initOutputFormat(jobContext: NewJobContext): Unit = { + if (outputFormat == null) { + outputFormat = getConf.getOutputFormat.getClass + .asInstanceOf[Class[_ <: OutputFormat[K, V]]] + } } - def commitJob() { - val cmtr = getOutputCommitter() - cmtr.commitJob(getJobContext()) + private def getOutputFormat(): OutputFormat[K, V] = { + require(outputFormat != null, "Must call initOutputFormat first.") + + outputFormat.newInstance() } - // ********* Private Functions ********* + // -------------------------------------------------------------------------- + // Verify hadoop config + // -------------------------------------------------------------------------- + + override def assertConf(jobContext: NewJobContext, conf: SparkConf): Unit = { + val outputFormatInstance = getOutputFormat() + val keyClass = getConf.getOutputKeyClass + val valueClass = getConf.getOutputValueClass + if (outputFormatInstance == null) { + throw new SparkException("Output format class not set") + } + if (keyClass == null) { + throw new SparkException("Output key class not set") + } + if (valueClass == null) { + throw new SparkException("Output value class not set") + } + SparkHadoopUtil.get.addCredentials(getConf) + + logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " + + valueClass.getSimpleName + ")") - private def getOutputFormat(): OutputFormat[AnyRef, AnyRef] = { - if (format == null) { - format = 
conf.value.getOutputFormat() - .asInstanceOf[OutputFormat[AnyRef, AnyRef]] + if (SparkHadoopWriterUtils.isOutputSpecValidationEnabled(conf)) { + // FileOutputFormat ignores the filesystem parameter + val ignoredFs = FileSystem.get(getConf) + getOutputFormat().checkOutputSpecs(ignoredFs, getConf) } - format + } +} + +/** + * A helper class that reads Configuration from newer mapreduce API, creates output + * Format/Committer/Writer. + */ +private[spark] +class HadoopMapReduceWriteConfigUtil[K, V: ClassTag](conf: SerializableConfiguration) + extends HadoopWriteConfigUtil[K, V] with Logging { + + private var outputFormat: Class[_ <: NewOutputFormat[K, V]] = null + private var writer: NewRecordWriter[K, V] = null + + private def getConf: Configuration = conf.value + + // -------------------------------------------------------------------------- + // Create JobContext/TaskAttemptContext + // -------------------------------------------------------------------------- + + override def createJobContext(jobTrackerId: String, jobId: Int): NewJobContext = { + val jobAttemptId = new NewTaskAttemptID(jobTrackerId, jobId, TaskType.MAP, 0, 0) + new NewTaskAttemptContextImpl(getConf, jobAttemptId) + } + + override def createTaskAttemptContext( + jobTrackerId: String, + jobId: Int, + splitId: Int, + taskAttemptId: Int): NewTaskAttemptContext = { + val attemptId = new NewTaskAttemptID( + jobTrackerId, jobId, TaskType.REDUCE, splitId, taskAttemptId) + new NewTaskAttemptContextImpl(getConf, attemptId) + } + + // -------------------------------------------------------------------------- + // Create committer + // -------------------------------------------------------------------------- + + override def createCommitter(jobId: Int): HadoopMapReduceCommitProtocol = { + FileCommitProtocol.instantiate( + className = classOf[HadoopMapReduceCommitProtocol].getName, + jobId = jobId.toString, + outputPath = getConf.get("mapreduce.output.fileoutputformat.outputdir"), + isAppend = false).asInstanceOf[HadoopMapReduceCommitProtocol] } - private def getOutputCommitter(): OutputCommitter = { - if (committer == null) { - committer = conf.value.getOutputCommitter + // -------------------------------------------------------------------------- + // Create writer + // -------------------------------------------------------------------------- + + override def initWriter(taskContext: NewTaskAttemptContext, splitId: Int): Unit = { + val taskFormat = getOutputFormat() + // If OutputFormat is Configurable, we should set conf to it. 
+ taskFormat match { + case c: Configurable => c.setConf(getConf) + case _ => () } - committer + + writer = taskFormat.getRecordWriter(taskContext) + .asInstanceOf[NewRecordWriter[K, V]] + + require(writer != null, "Unable to obtain RecordWriter") + } + + override def write(pair: (K, V)): Unit = { + require(writer != null, "Must call createWriter before write.") + writer.write(pair._1, pair._2) } - private def getJobContext(): JobContext = { - if (jobContext == null) { - jobContext = new JobContextImpl(conf.value, jID.value) + override def closeWriter(taskContext: NewTaskAttemptContext): Unit = { + if (writer != null) { + writer.close(taskContext) + writer = null + } else { + logWarning("Writer has been closed.") } - jobContext } - private def getTaskContext(): TaskAttemptContext = { - if (taskContext == null) { - taskContext = newTaskAttemptContext(conf.value, taID.value) + // -------------------------------------------------------------------------- + // Create OutputFormat + // -------------------------------------------------------------------------- + + override def initOutputFormat(jobContext: NewJobContext): Unit = { + if (outputFormat == null) { + outputFormat = jobContext.getOutputFormatClass + .asInstanceOf[Class[_ <: NewOutputFormat[K, V]]] } - taskContext } - protected def newTaskAttemptContext( - conf: JobConf, - attemptId: TaskAttemptID): TaskAttemptContext = { - new TaskAttemptContextImpl(conf, attemptId) + private def getOutputFormat(): NewOutputFormat[K, V] = { + require(outputFormat != null, "Must call initOutputFormat first.") + + outputFormat.newInstance() } - private def setIDs(jobid: Int, splitid: Int, attemptid: Int) { - jobID = jobid - splitID = splitid - attemptID = attemptid + // -------------------------------------------------------------------------- + // Verify hadoop config + // -------------------------------------------------------------------------- - jID = new SerializableWritable[JobID](SparkHadoopWriterUtils.createJobID(now, jobid)) - taID = new SerializableWritable[TaskAttemptID]( - new TaskAttemptID(new TaskID(jID.value, TaskType.MAP, splitID), attemptID)) + override def assertConf(jobContext: NewJobContext, conf: SparkConf): Unit = { + if (SparkHadoopWriterUtils.isOutputSpecValidationEnabled(conf)) { + getOutputFormat().checkOutputSpecs(jobContext) + } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 58762cc0838cd..4628fa8ba270e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -27,7 +27,6 @@ import scala.reflect.ClassTag import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat} @@ -36,13 +35,11 @@ import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewO import org.apache.spark._ import org.apache.spark.Partitioner.defaultPartitioner import org.apache.spark.annotation.Experimental -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.io.{SparkHadoopMapReduceWriter, SparkHadoopWriter, - SparkHadoopWriterUtils} +import org.apache.spark.internal.io._ import org.apache.spark.internal.Logging import 
org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.serializer.Serializer -import org.apache.spark.util.Utils +import org.apache.spark.util.{SerializableConfiguration, SerializableJobConf, Utils} import org.apache.spark.util.collection.CompactBuffer import org.apache.spark.util.random.StratifiedSamplingUtils @@ -1082,9 +1079,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * result of using direct output committer with speculation enabled. */ def saveAsNewAPIHadoopDataset(conf: Configuration): Unit = self.withScope { - SparkHadoopMapReduceWriter.write( + val config = new HadoopMapReduceWriteConfigUtil[K, V](new SerializableConfiguration(conf)) + SparkHadoopWriter.write( rdd = self, - hadoopConf = conf) + config = config) } /** @@ -1094,62 +1092,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * MapReduce job. */ def saveAsHadoopDataset(conf: JobConf): Unit = self.withScope { - // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). - val hadoopConf = conf - val outputFormatInstance = hadoopConf.getOutputFormat - val keyClass = hadoopConf.getOutputKeyClass - val valueClass = hadoopConf.getOutputValueClass - if (outputFormatInstance == null) { - throw new SparkException("Output format class not set") - } - if (keyClass == null) { - throw new SparkException("Output key class not set") - } - if (valueClass == null) { - throw new SparkException("Output value class not set") - } - SparkHadoopUtil.get.addCredentials(hadoopConf) - - logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " + - valueClass.getSimpleName + ")") - - if (SparkHadoopWriterUtils.isOutputSpecValidationEnabled(self.conf)) { - // FileOutputFormat ignores the filesystem parameter - val ignoredFs = FileSystem.get(hadoopConf) - hadoopConf.getOutputFormat.checkOutputSpecs(ignoredFs, hadoopConf) - } - - val writer = new SparkHadoopWriter(hadoopConf) - writer.preSetup() - - val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => { - // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it - // around by taking a mod. We expect that no task will be attempted 2 billion times. 
- val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt - - val (outputMetrics, callback) = SparkHadoopWriterUtils.initHadoopOutputMetrics(context) - - writer.setup(context.stageId, context.partitionId, taskAttemptId) - writer.open() - var recordsWritten = 0L - - Utils.tryWithSafeFinallyAndFailureCallbacks { - while (iter.hasNext) { - val record = iter.next() - writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef]) - - // Update bytes written metric every few records - SparkHadoopWriterUtils.maybeUpdateOutputMetrics(outputMetrics, callback, recordsWritten) - recordsWritten += 1 - } - }(finallyBlock = writer.close()) - writer.commit() - outputMetrics.setBytesWritten(callback()) - outputMetrics.setRecordsWritten(recordsWritten) - } - - self.context.runJob(self, writeToFile) - writer.commitJob() + val config = new HadoopMapRedWriteConfigUtil[K, V](new SerializableJobConf(conf)) + SparkHadoopWriter.write( + rdd = self, + config = config) } /** diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 02df157be377c..44dd955ce8690 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -561,7 +561,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { pairs.saveAsHadoopFile( "ignored", pairs.keyClass, pairs.valueClass, classOf[FakeFormatWithCallback], conf) } - assert(e.getMessage contains "failed to write") + assert(e.getCause.getMessage contains "failed to write") assert(FakeWriterWithCallback.calledBy === "write,callback,close") assert(FakeWriterWithCallback.exception != null, "exception should be captured") diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index e51e6a0d3ff6b..1579b614ea5b0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -18,12 +18,14 @@ package org.apache.spark.scheduler import java.io.File +import java.util.Date import java.util.concurrent.TimeoutException import scala.concurrent.duration._ import scala.language.postfixOps -import org.apache.hadoop.mapred.{JobConf, OutputCommitter, TaskAttemptContext, TaskAttemptID} +import org.apache.hadoop.mapred._ +import org.apache.hadoop.mapreduce.TaskType import org.mockito.Matchers import org.mockito.Mockito._ import org.mockito.invocation.InvocationOnMock @@ -31,7 +33,7 @@ import org.mockito.stubbing.Answer import org.scalatest.BeforeAndAfter import org.apache.spark._ -import org.apache.spark.internal.io.SparkHadoopWriter +import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapRedCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.rdd.{FakeOutputCommitter, RDD} import org.apache.spark.util.{ThreadUtils, Utils} @@ -214,6 +216,8 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { */ private case class OutputCommitFunctions(tempDirPath: String) { + private val jobId = new SerializableWritable(SparkHadoopWriterUtils.createJobID(new Date, 0)) + // Mock output committer that simulates a successful commit (after commit is authorized) private def successfulOutputCommitter = new FakeOutputCommitter { override def commitTask(context: TaskAttemptContext): Unit = { @@ 
-256,14 +260,23 @@ private case class OutputCommitFunctions(tempDirPath: String) { def jobConf = new JobConf { override def getOutputCommitter(): OutputCommitter = outputCommitter } - val sparkHadoopWriter = new SparkHadoopWriter(jobConf) { - override def newTaskAttemptContext( - conf: JobConf, - attemptId: TaskAttemptID): TaskAttemptContext = { - mock(classOf[TaskAttemptContext]) - } - } - sparkHadoopWriter.setup(ctx.stageId, ctx.partitionId, ctx.attemptNumber) - sparkHadoopWriter.commit() + + // Instantiate committer. + val committer = FileCommitProtocol.instantiate( + className = classOf[HadoopMapRedCommitProtocol].getName, + jobId = jobId.value.getId.toString, + outputPath = jobConf.get("mapred.output.dir"), + isAppend = false) + + // Create TaskAttemptContext. + // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it + // around by taking a mod. We expect that no task will be attempted 2 billion times. + val taskAttemptId = (ctx.taskAttemptId % Int.MaxValue).toInt + val attemptId = new TaskAttemptID( + new TaskID(jobId.value, TaskType.MAP, ctx.partitionId), taskAttemptId) + val taskContext = new TaskAttemptContextImpl(jobConf, attemptId) + + committer.setupTask(taskContext) + committer.commitTask(taskContext) } } From 528c9281aecc49e9bff204dd303962c705c6f237 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 30 Jun 2017 23:25:14 +0800 Subject: [PATCH 056/779] [ML] Fix scala-2.10 build failure of GeneralizedLinearRegressionSuite. ## What changes were proposed in this pull request? Fix scala-2.10 build failure of ```GeneralizedLinearRegressionSuite```. ## How was this patch tested? Build with scala-2.10. Author: Yanbo Liang Closes #18489 from yanboliang/glr. --- .../ml/regression/GeneralizedLinearRegressionSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index cfaa57314bd66..83f1344a7bcb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1075,7 +1075,7 @@ class GeneralizedLinearRegressionSuite val seCoefR = Array(1.23439, 0.9669, 3.56866) val tValsR = Array(0.80297, -0.65737, -0.06017) val pValsR = Array(0.42199, 0.51094, 0.95202) - val dispersionR = 1 + val dispersionR = 1.0 val nullDevianceR = 2.17561 val residualDevianceR = 0.00018 val residualDegreeOfFreedomNullR = 3 @@ -1114,7 +1114,7 @@ class GeneralizedLinearRegressionSuite assert(x._1 ~== x._2 absTol 1E-3) } summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } - assert(summary.dispersion ~== dispersionR absTol 1E-3) + assert(summary.dispersion === dispersionR) assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) assert(summary.deviance ~== residualDevianceR absTol 1E-3) assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) @@ -1190,7 +1190,7 @@ class GeneralizedLinearRegressionSuite val seCoefR = Array(1.16826, 0.41703, 1.96249) val tValsR = Array(-2.46387, 2.12428, -2.32757) val pValsR = Array(0.01374, 0.03365, 0.01993) - val dispersionR = 1 + val dispersionR = 1.0 val nullDevianceR = 22.55853 val residualDevianceR = 9.5622 val residualDegreeOfFreedomNullR = 3 @@ -1229,7 +1229,7 @@ class 
GeneralizedLinearRegressionSuite assert(x._1 ~== x._2 absTol 1E-3) } summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } - assert(summary.dispersion ~== dispersionR absTol 1E-3) + assert(summary.dispersion === dispersionR) assert(summary.nullDeviance ~== nullDevianceR absTol 1E-3) assert(summary.deviance ~== residualDevianceR absTol 1E-3) assert(summary.residualDegreeOfFreedom === residualDegreeOfFreedomR) From 1fe08d62f022e12f2f0161af5d8f9eac51baf1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9B=BE=E6=9E=97=E8=A5=BF?= Date: Fri, 30 Jun 2017 19:28:43 +0100 Subject: [PATCH 057/779] [SPARK-21223] Change fileToAppInfo in FsHistoryProvider to fix concurrent issue. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What issue does this PR address ? Jira:https://issues.apache.org/jira/browse/SPARK-21223 fix the Thread-safety issue in FsHistoryProvider Currently, Spark HistoryServer use a HashMap named fileToAppInfo in class FsHistoryProvider to store the map of eventlog path and attemptInfo. When use ThreadPool to Replay the log files in the list and merge the list of old applications with new ones, multi thread may update fileToAppInfo at the same time, which may cause Thread-safety issues, such as falling into an infinite loop because of calling resize func of the hashtable. Author: 曾林西 Closes #18430 from zenglinxi0615/master. --- .../apache/spark/deploy/history/FsHistoryProvider.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index d05ca142b618b..b2a50bd055712 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import java.io.{FileNotFoundException, IOException, OutputStream} import java.util.UUID -import java.util.concurrent.{Executors, ExecutorService, Future, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, Executors, ExecutorService, Future, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.mutable @@ -122,7 +122,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) @volatile private var applications: mutable.LinkedHashMap[String, FsApplicationHistoryInfo] = new mutable.LinkedHashMap() - val fileToAppInfo = new mutable.HashMap[Path, FsApplicationAttemptInfo]() + val fileToAppInfo = new ConcurrentHashMap[Path, FsApplicationAttemptInfo]() // List of application logs to be deleted by event log cleaner. private var attemptsToClean = new mutable.ListBuffer[FsApplicationAttemptInfo] @@ -321,7 +321,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // scan for modified applications, replay and merge them val logInfos: Seq[FileStatus] = statusList .filter { entry => - val prevFileSize = fileToAppInfo.get(entry.getPath()).map{_.fileSize}.getOrElse(0L) + val fileInfo = fileToAppInfo.get(entry.getPath()) + val prevFileSize = if (fileInfo != null) fileInfo.fileSize else 0L !entry.isDirectory() && // FsHistoryProvider generates a hidden file which can't be read. 
Accidentally // reading a garbage file is safe, but we would log an error which can be scary to @@ -475,7 +476,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) fileStatus.getLen(), appListener.appSparkVersion.getOrElse("") ) - fileToAppInfo(logPath) = attemptInfo + fileToAppInfo.put(logPath, attemptInfo) logDebug(s"Application log ${attemptInfo.logPath} loaded successfully: $attemptInfo") Some(attemptInfo) } else { From eed9c4ef859fdb75a816a3e0ce2d593b34b23444 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Fri, 30 Jun 2017 14:23:56 -0700 Subject: [PATCH 058/779] [SPARK-21129][SQL] Arguments of SQL function call should not be named expressions ### What changes were proposed in this pull request? Function argument should not be named expressions. It could cause two issues: - Misleading error message - Unexpected query results when the column name is `distinct`, which is not a reserved word in our parser. ``` spark-sql> select count(distinct c1, distinct c2) from t1; Error in query: cannot resolve '`distinct`' given input columns: [c1, c2]; line 1 pos 26; 'Project [unresolvedalias('count(c1#30, 'distinct), None)] +- SubqueryAlias t1 +- CatalogRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c1#30, c2#31] ``` After the fix, the error message becomes ``` spark-sql> select count(distinct c1, distinct c2) from t1; Error in query: extraneous input 'c2' expecting {')', ',', '.', '[', 'OR', 'AND', 'IN', NOT, 'BETWEEN', 'LIKE', RLIKE, 'IS', EQ, '<=>', '<>', '!=', '<', LTE, '>', GTE, '+', '-', '*', '/', '%', 'DIV', '&', '|', '||', '^'}(line 1, pos 35) == SQL == select count(distinct c1, distinct c2) from t1 -----------------------------------^^^ ``` ### How was this patch tested? Added a test case to parser suite. Author: Xiao Li Author: gatorsmile Closes #18338 from gatorsmile/parserDistinctAggFunc. --- .../spark/sql/catalyst/parser/SqlBase.g4 | 3 +- .../spark/sql/catalyst/dsl/package.scala | 1 + .../sql/catalyst/parser/AstBuilder.scala | 9 +++++- .../parser/ExpressionParserSuite.scala | 6 ++-- .../sql/catalyst/parser/PlanParserSuite.scala | 6 ++++ .../resources/sql-tests/inputs/struct.sql | 7 ++++ .../sql-tests/results/struct.sql.out | 32 ++++++++++++++++++- 7 files changed, 59 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 9456031736528..7ffa150096333 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -561,6 +561,7 @@ primaryExpression | CASE whenClause+ (ELSE elseExpression=expression)? END #searchedCase | CASE value=expression whenClause+ (ELSE elseExpression=expression)? END #simpleCase | CAST '(' expression AS dataType ')' #cast + | STRUCT '(' (argument+=namedExpression (',' argument+=namedExpression)*)? ')' #struct | FIRST '(' expression (IGNORE NULLS)? ')' #first | LAST '(' expression (IGNORE NULLS)? ')' #last | POSITION '(' substr=valueExpression IN str=valueExpression ')' #position @@ -569,7 +570,7 @@ primaryExpression | qualifiedName '.' ASTERISK #star | '(' namedExpression (',' namedExpression)+ ')' #rowConstructor | '(' query ')' #subqueryExpression - | qualifiedName '(' (setQuantifier? namedExpression (',' namedExpression)*)? ')' + | qualifiedName '(' (setQuantifier? argument+=expression (',' argument+=expression)*)? ')' (OVER windowSpec)? 
#functionCall | value=primaryExpression '[' index=valueExpression ']' #subscript | identifier #columnReference diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index f6792569b704e..7c100afcd738f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -170,6 +170,7 @@ package object dsl { case Seq() => UnresolvedStar(None) case target => UnresolvedStar(Option(target)) } + def namedStruct(e: Expression*): Expression = CreateNamedStruct(e) def callFunction[T, U]( func: T => U, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ef79cbcaa0ce6..8eac3ef2d3568 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1061,6 +1061,13 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging Cast(expression(ctx.expression), visitSparkDataType(ctx.dataType)) } + /** + * Create a [[CreateStruct]] expression. + */ + override def visitStruct(ctx: StructContext): Expression = withOrigin(ctx) { + CreateStruct(ctx.argument.asScala.map(expression)) + } + /** * Create a [[First]] expression. */ @@ -1091,7 +1098,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging // Create the function call. val name = ctx.qualifiedName.getText val isDistinct = Option(ctx.setQuantifier()).exists(_.DISTINCT != null) - val arguments = ctx.namedExpression().asScala.map(expression) match { + val arguments = ctx.argument.asScala.map(expression) match { case Seq(UnresolvedStar(None)) if name.toLowerCase(Locale.ROOT) == "count" && !isDistinct => // Transform COUNT(*) into COUNT(1). diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 4d08f016a4a16..45f9f72dccc45 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -231,7 +231,7 @@ class ExpressionParserSuite extends PlanTest { assertEqual("foo(distinct a, b)", 'foo.distinctFunction('a, 'b)) assertEqual("grouping(distinct a, b)", 'grouping.distinctFunction('a, 'b)) assertEqual("`select`(all a, b)", 'select.function('a, 'b)) - assertEqual("foo(a as x, b as e)", 'foo.function('a as 'x, 'b as 'e)) + intercept("foo(a x)", "extraneous input 'x'") } test("window function expressions") { @@ -330,7 +330,9 @@ class ExpressionParserSuite extends PlanTest { assertEqual("a.b", UnresolvedAttribute("a.b")) assertEqual("`select`.b", UnresolvedAttribute("select.b")) assertEqual("(a + b).b", ('a + 'b).getField("b")) // This will fail analysis. 
- assertEqual("struct(a, b).b", 'struct.function('a, 'b).getField("b")) + assertEqual( + "struct(a, b).b", + namedStruct(NamePlaceholder, 'a, NamePlaceholder, 'b).getField("b")) } test("reference") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index bf15b85d5b510..5b2573fa4d601 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -223,6 +223,12 @@ class PlanParserSuite extends AnalysisTest { assertEqual(s"$sql grouping sets((a, b), (a), ())", GroupingSets(Seq(Seq('a, 'b), Seq('a), Seq()), Seq('a, 'b), table("d"), Seq('a, 'b, 'sum.function('c).as("c")))) + + val m = intercept[ParseException] { + parsePlan("SELECT a, b, count(distinct a, distinct b) as c FROM d GROUP BY a, b") + }.getMessage + assert(m.contains("extraneous input 'b'")) + } test("limit") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/struct.sql b/sql/core/src/test/resources/sql-tests/inputs/struct.sql index e56344dc4de80..93a1238ab18c2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/struct.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/struct.sql @@ -18,3 +18,10 @@ SELECT ID, STRUCT(ST.*,CAST(ID AS STRING) AS E) NST FROM tbl_x; -- Prepend a column to a struct SELECT ID, STRUCT(CAST(ID AS STRING) AS AA, ST.*) NST FROM tbl_x; + +-- Select a column from a struct +SELECT ID, STRUCT(ST.*).C NST FROM tbl_x; +SELECT ID, STRUCT(ST.C, ST.D).D NST FROM tbl_x; + +-- Select an alias from a struct +SELECT ID, STRUCT(ST.C as STC, ST.D as STD).STD FROM tbl_x; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/struct.sql.out b/sql/core/src/test/resources/sql-tests/results/struct.sql.out index 3e32f46195464..1da33bc736f0b 100644 --- a/sql/core/src/test/resources/sql-tests/results/struct.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/struct.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 6 +-- Number of queries: 9 -- !query 0 @@ -58,3 +58,33 @@ struct> 1 {"AA":"1","C":"gamma","D":"delta"} 2 {"AA":"2","C":"epsilon","D":"eta"} 3 {"AA":"3","C":"theta","D":"iota"} + + +-- !query 6 +SELECT ID, STRUCT(ST.*).C NST FROM tbl_x +-- !query 6 schema +struct +-- !query 6 output +1 gamma +2 epsilon +3 theta + + +-- !query 7 +SELECT ID, STRUCT(ST.C, ST.D).D NST FROM tbl_x +-- !query 7 schema +struct +-- !query 7 output +1 delta +2 eta +3 iota + + +-- !query 8 +SELECT ID, STRUCT(ST.C as STC, ST.D as STD).STD FROM tbl_x +-- !query 8 schema +struct +-- !query 8 output +1 delta +2 eta +3 iota From fd1325522549937232f37215db53d6478f48644c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 30 Jun 2017 15:11:27 -0700 Subject: [PATCH 059/779] [SPARK-21052][SQL][FOLLOW-UP] Add hash map metrics to join ## What changes were proposed in this pull request? Remove `numHashCollisions` in `BytesToBytesMap`. And change `getAverageProbesPerLookup()` to `getAverageProbesPerLookup` as suggested. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #18480 from viirya/SPARK-21052-followup. 
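The parenthesis change reflects the usual Scala convention that an arity-0, side-effect-free accessor is declared and called without an empty parameter list. A minimal sketch of that convention, using illustrative names rather than the patch's own classes:

```scala
class ProbeStats(numProbes: Long, numKeyLookups: Long) {
  // Pure accessor: declared without "()", so call sites read like field access.
  def averageProbesPerLookup: Double = numProbes.toDouble / numKeyLookups
}

// e.g. new ProbeStats(30L, 10L).averageProbesPerLookup yields 3.0
```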
--- .../spark/unsafe/map/BytesToBytesMap.java | 33 ------------------- .../spark/sql/execution/joins/HashJoin.scala | 2 +- .../sql/execution/joins/HashedRelation.scala | 8 ++--- 3 files changed, 5 insertions(+), 38 deletions(-) diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 4bef21b6b4e4d..3b6200e74f1e1 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -160,14 +160,10 @@ public final class BytesToBytesMap extends MemoryConsumer { private final boolean enablePerfMetrics; - private long timeSpentResizingNs = 0; - private long numProbes = 0; private long numKeyLookups = 0; - private long numHashCollisions = 0; - private long peakMemoryUsedBytes = 0L; private final int initialCapacity; @@ -489,10 +485,6 @@ public void safeLookup(Object keyBase, long keyOffset, int keyLength, Location l ); if (areEqual) { return; - } else { - if (enablePerfMetrics) { - numHashCollisions++; - } } } } @@ -859,16 +851,6 @@ public long getPeakMemoryUsedBytes() { return peakMemoryUsedBytes; } - /** - * Returns the total amount of time spent resizing this map (in nanoseconds). - */ - public long getTimeSpentResizingNs() { - if (!enablePerfMetrics) { - throw new IllegalStateException(); - } - return timeSpentResizingNs; - } - /** * Returns the average number of probes per key lookup. */ @@ -879,13 +861,6 @@ public double getAverageProbesPerLookup() { return (1.0 * numProbes) / numKeyLookups; } - public long getNumHashCollisions() { - if (!enablePerfMetrics) { - throw new IllegalStateException(); - } - return numHashCollisions; - } - @VisibleForTesting public int getNumDataPages() { return dataPages.size(); @@ -923,10 +898,6 @@ public void reset() { void growAndRehash() { assert(longArray != null); - long resizeStartTime = -1; - if (enablePerfMetrics) { - resizeStartTime = System.nanoTime(); - } // Store references to the old data structures to be used when we re-hash final LongArray oldLongArray = longArray; final int oldCapacity = (int) oldLongArray.size() / 2; @@ -951,9 +922,5 @@ void growAndRehash() { longArray.set(newPos * 2 + 1, hashcode); } freeArray(oldLongArray); - - if (enablePerfMetrics) { - timeSpentResizingNs += System.nanoTime() - resizeStartTime; - } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index b09edf380c2d4..0396168d3f311 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -215,7 +215,7 @@ trait HashJoin { // At the end of the task, we update the avg hash probe. 
TaskContext.get().addTaskCompletionListener(_ => - avgHashProbe.set(hashed.getAverageProbesPerLookup())) + avgHashProbe.set(hashed.getAverageProbesPerLookup)) val resultProj = createResultProjection joinedIter.map { r =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 3c702856114f9..2038cb9edb67d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -83,7 +83,7 @@ private[execution] sealed trait HashedRelation extends KnownSizeEstimation { /** * Returns the average number of probes per key lookup. */ - def getAverageProbesPerLookup(): Double + def getAverageProbesPerLookup: Double } private[execution] object HashedRelation { @@ -280,7 +280,7 @@ private[joins] class UnsafeHashedRelation( read(in.readInt, in.readLong, in.readBytes) } - override def getAverageProbesPerLookup(): Double = binaryMap.getAverageProbesPerLookup() + override def getAverageProbesPerLookup: Double = binaryMap.getAverageProbesPerLookup } private[joins] object UnsafeHashedRelation { @@ -776,7 +776,7 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap /** * Returns the average number of probes per key lookup. */ - def getAverageProbesPerLookup(): Double = numProbes.toDouble / numKeyLookups + def getAverageProbesPerLookup: Double = numProbes.toDouble / numKeyLookups } private[joins] class LongHashedRelation( @@ -829,7 +829,7 @@ private[joins] class LongHashedRelation( map = in.readObject().asInstanceOf[LongToUnsafeRowMap] } - override def getAverageProbesPerLookup(): Double = map.getAverageProbesPerLookup() + override def getAverageProbesPerLookup: Double = map.getAverageProbesPerLookup } /**
From 4eb41879ce774dec1d16b2281ab1fbf41f9d418a Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sat, 1 Jul 2017 09:25:29 +0800 Subject: [PATCH 060/779] [SPARK-17528][SQL] data should be copied properly before saving into InternalRow
## What changes were proposed in this pull request? For performance reasons, `UnsafeRow.getString`, `getStruct`, etc. return a "pointer" that points to a memory region of this unsafe row. This makes the unsafe projection a little dangerous, because all of its output rows share one instance. When we implement SQL operators, we should be careful not to cache the input rows, because they may be produced by unsafe projection from a child operator and thus their content may change over time. However, when updating values of InternalRow (e.g. in mutable projection and safe projection), we only copy UTF8String; we should also copy InternalRow, ArrayData and MapData. This PR fixes this, and also fixes the copy of various InternalRow, ArrayData and MapData implementations.
## How was this patch tested? new regression tests
Author: Wenchen Fan Closes #18483 from cloud-fan/fix-copy.
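A hedged sketch, in the spirit of the new regression tests, of the hazard this change addresses (the projection and the values used here are illustrative, not taken from the patch): an `UnsafeProjection` reuses a single output row, so anything read from it is a pointer into a shared buffer and must be copied before it is cached.

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, UnsafeProjection}
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

val proj = UnsafeProjection.create(Seq(BoundReference(0, StringType, nullable = true)))

val first   = proj(InternalRow(UTF8String.fromString("a")))
val pointer = first.getUTF8String(0)          // points into the reused output buffer
val copied  = InternalRow.copyValue(pointer)  // defensive copy via the helper added here

proj(InternalRow(UTF8String.fromString("b"))) // the shared buffer is overwritten
// `pointer` may now observe "b", while `copied` still holds "a" -- which is why mutable
// and safe projections, and ImperativeAggregates, must copy values before caching them.
```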
--- .../apache/spark/unsafe/types/UTF8String.java | 6 + .../spark/sql/catalyst/InternalRow.scala | 27 ++++- .../spark/sql/catalyst/expressions/Cast.scala | 2 +- .../expressions/SpecificInternalRow.scala | 12 -- .../expressions/aggregate/collect.scala | 2 +- .../expressions/aggregate/interfaces.scala | 6 + .../expressions/codegen/CodeGenerator.scala | 6 +- .../codegen/GenerateSafeProjection.scala | 2 - .../spark/sql/catalyst/expressions/rows.scala | 23 ++-- .../sql/catalyst/util/GenericArrayData.scala | 10 +- .../scala/org/apache/spark/sql/RowTest.scala | 4 - .../catalyst/expressions/MapDataSuite.scala | 57 ---------- .../codegen/GeneratedProjectionSuite.scala | 36 ++++++ .../sql/catalyst/util/ComplexDataSuite.scala | 107 ++++++++++++++++++ .../execution/vectorized/ColumnarBatch.java | 2 +- .../SortBasedAggregationIterator.scala | 15 +-- .../columnar/GenerateColumnAccessor.scala | 1 - .../execution/window/AggregateProcessor.scala | 7 +- 18 files changed, 212 insertions(+), 113 deletions(-) delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 40b9fc9534f44..9de4ca71ff6d4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1088,6 +1088,12 @@ public UTF8String clone() { return fromBytes(getBytes()); } + public UTF8String copy() { + byte[] bytes = new byte[numBytes]; + copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes); + return fromBytes(bytes); + } + @Override public int compareTo(@Nonnull final UTF8String other) { int len = Math.min(numBytes, other.numBytes); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala index 256f64e320be8..29110640d64f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types.{DataType, Decimal, StructType} +import org.apache.spark.unsafe.types.UTF8String /** * An abstract class for row used internally in Spark SQL, which only contains the columns as @@ -33,6 +35,10 @@ abstract class InternalRow extends SpecializedGetters with Serializable { def setNullAt(i: Int): Unit + /** + * Updates the value at column `i`. Note that after updating, the given value will be kept in this + * row, and the caller side should guarantee that this value won't be changed afterwards. + */ def update(i: Int, value: Any): Unit // default implementation (slow) @@ -58,7 +64,15 @@ abstract class InternalRow extends SpecializedGetters with Serializable { def copy(): InternalRow /** Returns true if there are any NULL values in this row. 
*/ - def anyNull: Boolean + def anyNull: Boolean = { + val len = numFields + var i = 0 + while (i < len) { + if (isNullAt(i)) { return true } + i += 1 + } + false + } /* ---------------------- utility methods for Scala ---------------------- */ @@ -94,4 +108,15 @@ object InternalRow { /** Returns an empty [[InternalRow]]. */ val empty = apply() + + /** + * Copies the given value if it's string/struct/array/map type. + */ + def copyValue(value: Any): Any = value match { + case v: UTF8String => v.copy() + case v: InternalRow => v.copy() + case v: ArrayData => v.copy() + case v: MapData => v.copy() + case _ => value + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 43df19ba009a8..3862e64b9d828 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -1047,7 +1047,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String final $rowClass $result = new $rowClass(${fieldsCasts.length}); final InternalRow $tmpRow = $c; $fieldsEvalCode - $evPrim = $result.copy(); + $evPrim = $result; """ } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificInternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificInternalRow.scala index 74e0b4691d4cc..75feaf670c84a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificInternalRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificInternalRow.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ /** @@ -220,17 +219,6 @@ final class SpecificInternalRow(val values: Array[MutableValue]) extends BaseGen override def isNullAt(i: Int): Boolean = values(i).isNull - override def copy(): InternalRow = { - val newValues = new Array[Any](values.length) - var i = 0 - while (i < values.length) { - newValues(i) = values(i).boxed - i += 1 - } - - new GenericInternalRow(newValues) - } - override protected def genericGet(i: Int): Any = values(i).boxed override def update(ordinal: Int, value: Any) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index 26cd9ab665383..0d2f9889a27d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -52,7 +52,7 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper // Do not allow null values. We follow the semantics of Hive's collect_list/collect_set here. 
// See: org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMkCollectionEvaluator if (value != null) { - buffer += value + buffer += InternalRow.copyValue(value) } buffer } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index fffcc7c9ef53a..7af4901435857 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -317,6 +317,9 @@ abstract class ImperativeAggregate extends AggregateFunction with CodegenFallbac * Updates its aggregation buffer, located in `mutableAggBuffer`, based on the given `inputRow`. * * Use `fieldNumber + mutableAggBufferOffset` to access fields of `mutableAggBuffer`. + * + * Note that, the input row may be produced by unsafe projection and it may not be safe to cache + * some fields of the input row, as the values can be changed unexpectedly. */ def update(mutableAggBuffer: InternalRow, inputRow: InternalRow): Unit @@ -326,6 +329,9 @@ abstract class ImperativeAggregate extends AggregateFunction with CodegenFallbac * * Use `fieldNumber + mutableAggBufferOffset` to access fields of `mutableAggBuffer`. * Use `fieldNumber + inputAggBufferOffset` to access fields of `inputAggBuffer`. + * + * Note that, the input row may be produced by unsafe projection and it may not be safe to cache + * some fields of the input row, as the values can be changed unexpectedly. */ def merge(mutableAggBuffer: InternalRow, inputAggBuffer: InternalRow): Unit } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 5158949b95629..b15bf2ca7c116 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -408,9 +408,11 @@ class CodegenContext { dataType match { case _ if isPrimitiveType(jt) => s"$row.set${primitiveTypeName(jt)}($ordinal, $value)" case t: DecimalType => s"$row.setDecimal($ordinal, $value, ${t.precision})" - // The UTF8String may came from UnsafeRow, otherwise clone is cheap (re-use the bytes) - case StringType => s"$row.update($ordinal, $value.clone())" case udt: UserDefinedType[_] => setColumn(row, udt.sqlType, ordinal, value) + // The UTF8String, InternalRow, ArrayData and MapData may came from UnsafeRow, we should copy + // it to avoid keeping a "pointer" to a memory region which may get updated afterwards. 
+ case StringType | _: StructType | _: ArrayType | _: MapType => + s"$row.update($ordinal, $value.copy())" case _ => s"$row.update($ordinal, $value)" } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index f708aeff2b146..dd0419d2286d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -131,8 +131,6 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] case s: StructType => createCodeForStruct(ctx, input, s) case ArrayType(elementType, _) => createCodeForArray(ctx, input, elementType) case MapType(keyType, valueType, _) => createCodeForMap(ctx, input, keyType, valueType) - // UTF8String act as a pointer if it's inside UnsafeRow, so copy it to make it safe. - case StringType => ExprCode("", "false", s"$input.clone()") case udt: UserDefinedType[_] => convertToSafe(ctx, input, udt.sqlType) case _ => ExprCode("", "false", input) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala index 751b821e1b009..65539a2f00e6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala @@ -50,16 +50,6 @@ trait BaseGenericInternalRow extends InternalRow { override def getMap(ordinal: Int): MapData = getAs(ordinal) override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal) - override def anyNull: Boolean = { - val len = numFields - var i = 0 - while (i < len) { - if (isNullAt(i)) { return true } - i += 1 - } - false - } - override def toString: String = { if (numFields == 0) { "[empty row]" @@ -79,6 +69,17 @@ trait BaseGenericInternalRow extends InternalRow { } } + override def copy(): GenericInternalRow = { + val len = numFields + val newValues = new Array[Any](len) + var i = 0 + while (i < len) { + newValues(i) = InternalRow.copyValue(genericGet(i)) + i += 1 + } + new GenericInternalRow(newValues) + } + override def equals(o: Any): Boolean = { if (!o.isInstanceOf[BaseGenericInternalRow]) { return false @@ -206,6 +207,4 @@ class GenericInternalRow(val values: Array[Any]) extends BaseGenericInternalRow override def setNullAt(i: Int): Unit = { values(i) = null} override def update(i: Int, value: Any): Unit = { values(i) = value } - - override def copy(): GenericInternalRow = this } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index dd660c80a9c3c..9e39ed9c3a778 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -49,7 +49,15 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) - override def copy(): ArrayData = new GenericArrayData(array.clone()) + override def copy(): ArrayData = { + val newValues = new Array[Any](array.length) + var i = 0 + while (i < array.length) { + newValues(i) = 
InternalRow.copyValue(array(i)) + i += 1 + } + new GenericArrayData(newValues) + } override def numElements(): Int = array.length diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala index c9c9599e7f463..25699de33d717 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala @@ -121,10 +121,6 @@ class RowTest extends FunSpec with Matchers { externalRow should be theSameInstanceAs externalRow.copy() } - it("copy should return same ref for internal rows") { - internalRow should be theSameInstanceAs internalRow.copy() - } - it("toSeq should not expose internal state for external rows") { val modifiedValues = modifyValues(externalRow.toSeq) externalRow.toSeq should not equal modifiedValues diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala deleted file mode 100644 index 25a675a90276d..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst.expressions - -import scala.collection._ - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData -import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} -import org.apache.spark.unsafe.types.UTF8String - -class MapDataSuite extends SparkFunSuite { - - test("inequality tests") { - def u(str: String): UTF8String = UTF8String.fromString(str) - - // test data - val testMap1 = Map(u("key1") -> 1) - val testMap2 = Map(u("key1") -> 1, u("key2") -> 2) - val testMap3 = Map(u("key1") -> 1) - val testMap4 = Map(u("key1") -> 1, u("key2") -> 2) - - // ArrayBasedMapData - val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) - val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) - val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) - val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) - assert(testArrayMap1 !== testArrayMap3) - assert(testArrayMap2 !== testArrayMap4) - - // UnsafeMapData - val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) - val row = new GenericInternalRow(1) - def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { - row.update(0, map) - val unsafeRow = unsafeConverter.apply(row) - unsafeRow.getMap(0).copy - } - assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) - assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) - } -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index 58ea5b9cb52d3..0cd0d8859145f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -172,4 +172,40 @@ class GeneratedProjectionSuite extends SparkFunSuite { assert(unsafe1 === unsafe3) assert(unsafe1.getStruct(1, 7) === unsafe3.getStruct(1, 7)) } + + test("MutableProjection should not cache content from the input row") { + val mutableProj = GenerateMutableProjection.generate( + Seq(BoundReference(0, new StructType().add("i", StringType), true))) + val row = new GenericInternalRow(1) + mutableProj.target(row) + + val unsafeProj = GenerateUnsafeProjection.generate( + Seq(BoundReference(0, new StructType().add("i", StringType), true))) + val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) + + mutableProj.apply(unsafeRow) + assert(row.getStruct(0, 1).getString(0) == "a") + + // Even if the input row of the mutable projection has been changed, the target mutable row + // should keep same. 
+ unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) + assert(row.getStruct(0, 1).getString(0).toString == "a") + } + + test("SafeProjection should not cache content from the input row") { + val safeProj = GenerateSafeProjection.generate( + Seq(BoundReference(0, new StructType().add("i", StringType), true))) + + val unsafeProj = GenerateUnsafeProjection.generate( + Seq(BoundReference(0, new StructType().add("i", StringType), true))) + val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) + + val row = safeProj.apply(unsafeRow) + assert(row.getStruct(0, 1).getString(0) == "a") + + // Even if the input row of the mutable projection has been changed, the target mutable row + // should keep same. + unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) + assert(row.getStruct(0, 1).getString(0).toString == "a") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala new file mode 100644 index 0000000000000..9d285916bcf42 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ComplexDataSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.util + +import scala.collection._ + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow, SpecificInternalRow, UnsafeMapData, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +class ComplexDataSuite extends SparkFunSuite { + def utf8(str: String): UTF8String = UTF8String.fromString(str) + + test("inequality tests for MapData") { + // test data + val testMap1 = Map(utf8("key1") -> 1) + val testMap2 = Map(utf8("key1") -> 1, utf8("key2") -> 2) + val testMap3 = Map(utf8("key1") -> 1) + val testMap4 = Map(utf8("key1") -> 1, utf8("key2") -> 2) + + // ArrayBasedMapData + val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) + val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) + val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) + val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) + assert(testArrayMap1 !== testArrayMap3) + assert(testArrayMap2 !== testArrayMap4) + + // UnsafeMapData + val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) + val row = new GenericInternalRow(1) + def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { + row.update(0, map) + val unsafeRow = unsafeConverter.apply(row) + unsafeRow.getMap(0).copy + } + assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) + assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) + } + + test("GenericInternalRow.copy return a new instance that is independent from the old one") { + val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) + val unsafeRow = project.apply(InternalRow(utf8("a"))) + + val genericRow = new GenericInternalRow(Array[Any](unsafeRow.getUTF8String(0))) + val copiedGenericRow = genericRow.copy() + assert(copiedGenericRow.getString(0) == "a") + project.apply(InternalRow(UTF8String.fromString("b"))) + // The copied internal row should not be changed externally. + assert(copiedGenericRow.getString(0) == "a") + } + + test("SpecificMutableRow.copy return a new instance that is independent from the old one") { + val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) + val unsafeRow = project.apply(InternalRow(utf8("a"))) + + val mutableRow = new SpecificInternalRow(Seq(StringType)) + mutableRow(0) = unsafeRow.getUTF8String(0) + val copiedMutableRow = mutableRow.copy() + assert(copiedMutableRow.getString(0) == "a") + project.apply(InternalRow(UTF8String.fromString("b"))) + // The copied internal row should not be changed externally. + assert(copiedMutableRow.getString(0) == "a") + } + + test("GenericArrayData.copy return a new instance that is independent from the old one") { + val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) + val unsafeRow = project.apply(InternalRow(utf8("a"))) + + val genericArray = new GenericArrayData(Array[Any](unsafeRow.getUTF8String(0))) + val copiedGenericArray = genericArray.copy() + assert(copiedGenericArray.getUTF8String(0).toString == "a") + project.apply(InternalRow(UTF8String.fromString("b"))) + // The copied array data should not be changed externally. 
+ assert(copiedGenericArray.getUTF8String(0).toString == "a") + } + + test("copy on nested complex type") { + val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) + val unsafeRow = project.apply(InternalRow(utf8("a"))) + + val arrayOfRow = new GenericArrayData(Array[Any](InternalRow(unsafeRow.getUTF8String(0)))) + val copied = arrayOfRow.copy() + assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a") + project.apply(InternalRow(UTF8String.fromString("b"))) + // The copied data should not be changed externally. + assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a") + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java index e23a64350cbc5..34dc3af9b85c8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java @@ -149,7 +149,7 @@ public InternalRow copy() { } else if (dt instanceof DoubleType) { row.setDouble(i, getDouble(i)); } else if (dt instanceof StringType) { - row.update(i, getUTF8String(i)); + row.update(i, getUTF8String(i).copy()); } else if (dt instanceof BinaryType) { row.update(i, getBinary(i)); } else if (dt instanceof DecimalType) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala index bea2dce1a7657..a5a444b160c63 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala @@ -86,17 +86,6 @@ class SortBasedAggregationIterator( // The aggregation buffer used by the sort-based aggregation. private[this] val sortBasedAggregationBuffer: InternalRow = newBuffer - // This safe projection is used to turn the input row into safe row. This is necessary - // because the input row may be produced by unsafe projection in child operator and all the - // produced rows share one byte array. However, when we update the aggregate buffer according to - // the input row, we may cache some values from input row, e.g. `Max` will keep the max value from - // input row via MutableProjection, `CollectList` will keep all values in an array via - // ImperativeAggregate framework. These values may get changed unexpectedly if the underlying - // unsafe projection update the shared byte array. By applying a safe projection to the input row, - // we can cut down the connection from input row to the shared byte array, and thus it's safe to - // cache values from input row while updating the aggregation buffer. - private[this] val safeProj: Projection = FromUnsafeProjection(valueAttributes.map(_.dataType)) - protected def initialize(): Unit = { if (inputIterator.hasNext) { initializeBuffer(sortBasedAggregationBuffer) @@ -119,7 +108,7 @@ class SortBasedAggregationIterator( // We create a variable to track if we see the next group. var findNextPartition = false // firstRowInNextGroup is the first row of this group. We first process it. - processRow(sortBasedAggregationBuffer, safeProj(firstRowInNextGroup)) + processRow(sortBasedAggregationBuffer, firstRowInNextGroup) // The search will stop when we see the next group or there is no // input row left in the iter. 
@@ -130,7 +119,7 @@ class SortBasedAggregationIterator( // Check if the current row belongs the current input row. if (currentGroupingKey == groupingKey) { - processRow(sortBasedAggregationBuffer, safeProj(currentRow)) + processRow(sortBasedAggregationBuffer, currentRow) } else { // We find a new group. findNextPartition = true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index d3fa0dcd2d7c3..fc977f2fd5530 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -56,7 +56,6 @@ class MutableUnsafeRow(val writer: UnsafeRowWriter) extends BaseGenericInternalR // all other methods inherited from GenericMutableRow are not need override protected def genericGet(ordinal: Int): Any = throw new UnsupportedOperationException override def numFields: Int = throw new UnsupportedOperationException - override def copy(): InternalRow = throw new UnsupportedOperationException } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala index 2195c6ea95948..bc141b36e63b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/AggregateProcessor.scala @@ -145,13 +145,10 @@ private[window] final class AggregateProcessor( /** Update the buffer. */ def update(input: InternalRow): Unit = { - // TODO(hvanhovell) this sacrifices performance for correctness. We should make sure that - // MutableProjection makes copies of the complex input objects it buffer. - val copy = input.copy() - updateProjection(join(buffer, copy)) + updateProjection(join(buffer, input)) var i = 0 while (i < numImperatives) { - imperatives(i).update(buffer, copy) + imperatives(i).update(buffer, input) i += 1 } } From 61b5df567eb8ae0df4059cb0e334316fff462de9 Mon Sep 17 00:00:00 2001 From: wangzhenhua Date: Sat, 1 Jul 2017 10:01:44 +0800 Subject: [PATCH 061/779] [SPARK-21127][SQL] Update statistics after data changing commands ## What changes were proposed in this pull request? Update stats after the following data changing commands: - InsertIntoHadoopFsRelationCommand - InsertIntoHiveTable - LoadDataCommand - TruncateTableCommand - AlterTableSetLocationCommand - AlterTableDropPartitionCommand ## How was this patch tested? Added new test cases. Author: wangzhenhua Author: Zhenhua Wang Closes #18334 from wzhfy/changeStatsForOperation. 
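A hedged usage sketch (the table name and the pre-existing `spark` session are assumed, not part of the patch): with the new `spark.sql.statistics.autoUpdate.size` flag enabled, the stored size statistic is recomputed after a data-changing command instead of being invalidated, at the cost of re-listing the table's files.

```scala
spark.conf.set("spark.sql.statistics.autoUpdate.size", "true")

spark.sql("CREATE TABLE stats_demo (i INT, j STRING) USING PARQUET")
spark.sql("ANALYZE TABLE stats_demo COMPUTE STATISTICS")   // seed initial stats
spark.sql("INSERT INTO TABLE stats_demo SELECT 1, 'abc'")  // sizeInBytes is refreshed

// Only sizeInBytes is kept up to date this way; row counts and column stats still
// require a fresh ANALYZE TABLE ... COMPUTE STATISTICS [FOR COLUMNS ...].
spark.sql("DESC EXTENDED stats_demo").show(100, truncate = false)
```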
--- .../apache/spark/sql/internal/SQLConf.scala | 10 + .../sql/execution/command/CommandUtils.scala | 17 +- .../spark/sql/execution/command/ddl.scala | 15 +- .../spark/sql/StatisticsCollectionSuite.scala | 77 +++++--- .../spark/sql/hive/StatisticsSuite.scala | 187 +++++++++++------- 5 files changed, 207 insertions(+), 99 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index c641e4d3a23e1..25152f3e32d6b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -774,6 +774,14 @@ object SQLConf { .doubleConf .createWithDefault(0.05) + val AUTO_UPDATE_SIZE = + buildConf("spark.sql.statistics.autoUpdate.size") + .doc("Enables automatic update for table size once table's data is changed. Note that if " + + "the total number of files of the table is very large, this can be expensive and slow " + + "down data change commands.") + .booleanConf + .createWithDefault(false) + val CBO_ENABLED = buildConf("spark.sql.cbo.enabled") .doc("Enables CBO for estimation of plan statistics when set true.") @@ -1083,6 +1091,8 @@ class SQLConf extends Serializable with Logging { def cboEnabled: Boolean = getConf(SQLConf.CBO_ENABLED) + def autoUpdateSize: Boolean = getConf(SQLConf.AUTO_UPDATE_SIZE) + def joinReorderEnabled: Boolean = getConf(SQLConf.JOIN_REORDER_ENABLED) def joinReorderDPThreshold: Int = getConf(SQLConf.JOIN_REORDER_DP_THRESHOLD) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 92397607f38fd..fce12cc96620c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -36,7 +36,14 @@ object CommandUtils extends Logging { def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = { if (table.stats.nonEmpty) { val catalog = sparkSession.sessionState.catalog - catalog.alterTableStats(table.identifier, None) + if (sparkSession.sessionState.conf.autoUpdateSize) { + val newTable = catalog.getTableMetadata(table.identifier) + val newSize = CommandUtils.calculateTotalSize(sparkSession.sessionState, newTable) + val newStats = CatalogStatistics(sizeInBytes = newSize) + catalog.alterTableStats(table.identifier, Some(newStats)) + } else { + catalog.alterTableStats(table.identifier, None) + } } } @@ -84,7 +91,9 @@ object CommandUtils extends Logging { size } - locationUri.map { p => + val startTime = System.nanoTime() + logInfo(s"Starting to calculate the total file size under path $locationUri.") + val size = locationUri.map { p => val path = new Path(p) try { val fs = path.getFileSystem(sessionState.newHadoopConf()) @@ -97,6 +106,10 @@ object CommandUtils extends Logging { 0L } }.getOrElse(0L) + val durationInMs = (System.nanoTime() - startTime) / (1000 * 1000) + logInfo(s"It took $durationInMs ms to calculate the total file size under path $locationUri.") + + size } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index ac897c1b22d77..ba7ca84f229fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -437,7 +437,20 @@ case class AlterTableAddPartitionCommand( } catalog.createPartitions(table.identifier, parts, ignoreIfExists = ifNotExists) - CommandUtils.updateTableStats(sparkSession, table) + if (table.stats.nonEmpty) { + if (sparkSession.sessionState.conf.autoUpdateSize) { + val addedSize = parts.map { part => + CommandUtils.calculateLocationSize(sparkSession.sessionState, table.identifier, + part.storage.locationUri) + }.sum + if (addedSize > 0) { + val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize) + catalog.alterTableStats(table.identifier, Some(newStats)) + } + } else { + catalog.alterTableStats(table.identifier, None) + } + } Seq.empty[Row] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index b031c52dad8b5..d9392de37a815 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.test.SQLTestData.ArrayData import org.apache.spark.sql.types._ @@ -178,36 +178,63 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared test("change stats after set location command") { val table = "change_stats_set_location_table" - withTable(table) { - spark.range(100).select($"id", $"id" % 5 as "value").write.saveAsTable(table) - // analyze to get initial stats - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS id, value") - val fetched1 = checkTableStats( - table, hasSizeInBytes = true, expectedRowCounts = Some(100)) - assert(fetched1.get.sizeInBytes > 0) - assert(fetched1.get.colStats.size == 2) - - // set location command - withTempDir { newLocation => - sql(s"ALTER TABLE $table SET LOCATION '${newLocation.toURI.toString}'") - checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + Seq(false, true).foreach { autoUpdate => + withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) { + withTable(table) { + spark.range(100).select($"id", $"id" % 5 as "value").write.saveAsTable(table) + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS id, value") + val fetched1 = checkTableStats( + table, hasSizeInBytes = true, expectedRowCounts = Some(100)) + assert(fetched1.get.sizeInBytes > 0) + assert(fetched1.get.colStats.size == 2) + + // set location command + val initLocation = spark.sessionState.catalog.getTableMetadata(TableIdentifier(table)) + .storage.locationUri.get.toString + withTempDir { newLocation => + sql(s"ALTER TABLE $table SET LOCATION '${newLocation.toURI.toString}'") + if (autoUpdate) { + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched2.get.sizeInBytes == 0) + assert(fetched2.get.colStats.isEmpty) + + // set back to the initial location + sql(s"ALTER TABLE $table SET LOCATION '$initLocation'") + val fetched3 = 
checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched3.get.sizeInBytes == fetched1.get.sizeInBytes) + } else { + checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + } + } + } } } } test("change stats after insert command for datasource table") { val table = "change_stats_insert_datasource_table" - withTable(table) { - sql(s"CREATE TABLE $table (i int, j string) USING PARQUET") - // analyze to get initial stats - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") - val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) - assert(fetched1.get.sizeInBytes == 0) - assert(fetched1.get.colStats.size == 2) - - // insert into command - sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") - checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + Seq(false, true).foreach { autoUpdate => + withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) { + withTable(table) { + sql(s"CREATE TABLE $table (i int, j string) USING PARQUET") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + // insert into command + sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") + if (autoUpdate) { + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched2.get.sizeInBytes > 0) + assert(fetched2.get.colStats.isEmpty) + } else { + checkTableStats(table, hasSizeInBytes = false, expectedRowCounts = None) + } + } + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 5fd266c2d033c..c601038a2b0af 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -444,88 +444,133 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("change stats after insert command for hive table") { val table = s"change_stats_insert_hive_table" - withTable(table) { - sql(s"CREATE TABLE $table (i int, j string)") - // analyze to get initial stats - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") - val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) - assert(fetched1.get.sizeInBytes == 0) - assert(fetched1.get.colStats.size == 2) - - // insert into command - sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") - assert(getStatsProperties(table).isEmpty) + Seq(false, true).foreach { autoUpdate => + withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) { + withTable(table) { + sql(s"CREATE TABLE $table (i int, j string)") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + // insert into command + sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'") + if (autoUpdate) { + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched2.get.sizeInBytes > 0) + assert(fetched2.get.colStats.isEmpty) + val statsProp = getStatsProperties(table) + assert(statsProp(STATISTICS_TOTAL_SIZE).toLong == 
fetched2.get.sizeInBytes) + } else { + assert(getStatsProperties(table).isEmpty) + } + } + } } } test("change stats after load data command") { val table = "change_stats_load_table" - withTable(table) { - sql(s"CREATE TABLE $table (i INT, j STRING) STORED AS PARQUET") - // analyze to get initial stats - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") - val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) - assert(fetched1.get.sizeInBytes == 0) - assert(fetched1.get.colStats.size == 2) - - withTempDir { loadPath => - // load data command - val file = new File(loadPath + "/data") - val writer = new PrintWriter(file) - writer.write("2,xyz") - writer.close() - sql(s"LOAD DATA INPATH '${loadPath.toURI.toString}' INTO TABLE $table") - assert(getStatsProperties(table).isEmpty) + Seq(false, true).foreach { autoUpdate => + withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) { + withTable(table) { + sql(s"CREATE TABLE $table (i INT, j STRING) STORED AS PARQUET") + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetched1.get.sizeInBytes == 0) + assert(fetched1.get.colStats.size == 2) + + withTempDir { loadPath => + // load data command + val file = new File(loadPath + "/data") + val writer = new PrintWriter(file) + writer.write("2,xyz") + writer.close() + sql(s"LOAD DATA INPATH '${loadPath.toURI.toString}' INTO TABLE $table") + if (autoUpdate) { + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched2.get.sizeInBytes > 0) + assert(fetched2.get.colStats.isEmpty) + val statsProp = getStatsProperties(table) + assert(statsProp(STATISTICS_TOTAL_SIZE).toLong == fetched2.get.sizeInBytes) + } else { + assert(getStatsProperties(table).isEmpty) + } + } + } } } } test("change stats after add/drop partition command") { val table = "change_stats_part_table" - withTable(table) { - sql(s"CREATE TABLE $table (i INT, j STRING) PARTITIONED BY (ds STRING, hr STRING)") - // table has two partitions initially - for (ds <- Seq("2008-04-08"); hr <- Seq("11", "12")) { - sql(s"INSERT OVERWRITE TABLE $table PARTITION (ds='$ds',hr='$hr') SELECT 1, 'a'") - } - // analyze to get initial stats - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") - val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)) - assert(fetched1.get.sizeInBytes > 0) - assert(fetched1.get.colStats.size == 2) - - withTempPaths(numPaths = 2) { case Seq(dir1, dir2) => - val file1 = new File(dir1 + "/data") - val writer1 = new PrintWriter(file1) - writer1.write("1,a") - writer1.close() - - val file2 = new File(dir2 + "/data") - val writer2 = new PrintWriter(file2) - writer2.write("1,a") - writer2.close() - - // add partition command - sql( - s""" - |ALTER TABLE $table ADD - |PARTITION (ds='2008-04-09', hr='11') LOCATION '${dir1.toURI.toString}' - |PARTITION (ds='2008-04-09', hr='12') LOCATION '${dir2.toURI.toString}' - """.stripMargin) - assert(getStatsProperties(table).isEmpty) - - // generate stats again - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") - val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(4)) - assert(fetched2.get.sizeInBytes > 0) - assert(fetched2.get.colStats.size == 2) - - // drop partition command - sql(s"ALTER TABLE $table DROP PARTITION (ds='2008-04-08'), PARTITION 
(hr='12')") - // only one partition left - assert(spark.sessionState.catalog.listPartitions(TableIdentifier(table)) - .map(_.spec).toSet == Set(Map("ds" -> "2008-04-09", "hr" -> "11"))) - assert(getStatsProperties(table).isEmpty) + Seq(false, true).foreach { autoUpdate => + withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) { + withTable(table) { + sql(s"CREATE TABLE $table (i INT, j STRING) PARTITIONED BY (ds STRING, hr STRING)") + // table has two partitions initially + for (ds <- Seq("2008-04-08"); hr <- Seq("11", "12")) { + sql(s"INSERT OVERWRITE TABLE $table PARTITION (ds='$ds',hr='$hr') SELECT 1, 'a'") + } + // analyze to get initial stats + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)) + assert(fetched1.get.sizeInBytes > 0) + assert(fetched1.get.colStats.size == 2) + + withTempPaths(numPaths = 2) { case Seq(dir1, dir2) => + val file1 = new File(dir1 + "/data") + val writer1 = new PrintWriter(file1) + writer1.write("1,a") + writer1.close() + + val file2 = new File(dir2 + "/data") + val writer2 = new PrintWriter(file2) + writer2.write("1,a") + writer2.close() + + // add partition command + sql( + s""" + |ALTER TABLE $table ADD + |PARTITION (ds='2008-04-09', hr='11') LOCATION '${dir1.toURI.toString}' + |PARTITION (ds='2008-04-09', hr='12') LOCATION '${dir2.toURI.toString}' + """.stripMargin) + if (autoUpdate) { + val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched2.get.sizeInBytes > fetched1.get.sizeInBytes) + assert(fetched2.get.colStats.isEmpty) + val statsProp = getStatsProperties(table) + assert(statsProp(STATISTICS_TOTAL_SIZE).toLong == fetched2.get.sizeInBytes) + } else { + assert(getStatsProperties(table).isEmpty) + } + + // now the table has four partitions, generate stats again + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j") + val fetched3 = checkTableStats( + table, hasSizeInBytes = true, expectedRowCounts = Some(4)) + assert(fetched3.get.sizeInBytes > 0) + assert(fetched3.get.colStats.size == 2) + + // drop partition command + sql(s"ALTER TABLE $table DROP PARTITION (ds='2008-04-08'), PARTITION (hr='12')") + assert(spark.sessionState.catalog.listPartitions(TableIdentifier(table)) + .map(_.spec).toSet == Set(Map("ds" -> "2008-04-09", "hr" -> "11"))) + // only one partition left + if (autoUpdate) { + val fetched4 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetched4.get.sizeInBytes < fetched1.get.sizeInBytes) + assert(fetched4.get.colStats.isEmpty) + val statsProp = getStatsProperties(table) + assert(statsProp(STATISTICS_TOTAL_SIZE).toLong == fetched4.get.sizeInBytes) + } else { + assert(getStatsProperties(table).isEmpty) + } + } + } } } } From b1d719e7c9faeb5661a7e712b3ecefca56bf356f Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 30 Jun 2017 21:10:23 -0700 Subject: [PATCH 062/779] [SPARK-21273][SQL] Propagate logical plan stats using visitor pattern and mixin ## What changes were proposed in this pull request? We currently implement statistics propagation directly in logical plan. Given we already have two different implementations, it'd make sense to actually decouple the two and add stats propagation using mixin. This would reduce the coupling between logical plan and statistics handling. This can also be a powerful pattern in the future to add additional properties (e.g. constraints). ## How was this patch tested? 
Should be covered by existing test cases. Author: Reynold Xin Closes #18479 from rxin/stats-trait. --- .../sql/catalyst/catalog/interface.scala | 2 +- .../plans/logical/LocalRelation.scala | 5 +- .../catalyst/plans/logical/LogicalPlan.scala | 61 +------ .../plans/logical/LogicalPlanVisitor.scala | 87 ++++++++++ .../plans/logical/basicLogicalOperators.scala | 128 +------------- .../sql/catalyst/plans/logical/hints.scala | 5 - .../BasicStatsPlanVisitor.scala | 82 +++++++++ .../statsEstimation/LogicalPlanStats.scala | 50 ++++++ .../SizeInBytesOnlyStatsPlanVisitor.scala | 163 ++++++++++++++++++ .../BasicStatsEstimationSuite.scala | 44 ----- .../StatsEstimationTestBase.scala | 2 +- .../spark/sql/execution/ExistingRDD.scala | 4 +- .../execution/columnar/InMemoryRelation.scala | 2 +- .../datasources/LogicalRelation.scala | 7 +- .../sql/execution/streaming/memory.scala | 3 +- .../PruneFileSourcePartitionsSuite.scala | 2 +- 16 files changed, 409 insertions(+), 238 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/LogicalPlanStats.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index da50b0e7e8e42..9531456434a15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -438,7 +438,7 @@ case class CatalogRelation( case (attr, index) => attr.withExprId(ExprId(index + dataCols.length)) }) - override def computeStats: Statistics = { + override def computeStats(): Statistics = { // For data source tables, we will create a `LogicalRelation` and won't call this method, for // hive serde tables, we will always generate a statistics. // TODO: unify the table stats generation. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala index dc2add64b68b7..1c986fbde7ada 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala @@ -66,9 +66,8 @@ case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) } } - override def computeStats: Statistics = - Statistics(sizeInBytes = - output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) + override def computeStats(): Statistics = + Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0d30aa76049a5..8649603b1a9f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -22,11 +22,16 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.types.StructType -abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstraints with Logging { +abstract class LogicalPlan + extends QueryPlan[LogicalPlan] + with LogicalPlanStats + with QueryPlanConstraints + with Logging { private var _analyzed: Boolean = false @@ -80,40 +85,6 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstrai } } - /** A cache for the estimated statistics, such that it will only be computed once. */ - private var statsCache: Option[Statistics] = None - - /** - * Returns the estimated statistics for the current logical plan node. Under the hood, this - * method caches the return value, which is computed based on the configuration passed in the - * first time. If the configuration changes, the cache can be invalidated by calling - * [[invalidateStatsCache()]]. - */ - final def stats: Statistics = statsCache.getOrElse { - statsCache = Some(computeStats) - statsCache.get - } - - /** Invalidates the stats cache. See [[stats]] for more information. */ - final def invalidateStatsCache(): Unit = { - statsCache = None - children.foreach(_.invalidateStatsCache()) - } - - /** - * Computes [[Statistics]] for this plan. The default implementation assumes the output - * cardinality is the product of all child plan's cardinality, i.e. applies in the case - * of cartesian joins. - * - * [[LeafNode]]s must override this. 
- */ - protected def computeStats: Statistics = { - if (children.isEmpty) { - throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.") - } - Statistics(sizeInBytes = children.map(_.stats.sizeInBytes).product) - } - override def verboseStringWithSuffix: String = { super.verboseString + statsCache.map(", " + _.toString).getOrElse("") } @@ -300,6 +271,9 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with QueryPlanConstrai abstract class LeafNode extends LogicalPlan { override final def children: Seq[LogicalPlan] = Nil override def producedAttributes: AttributeSet = outputSet + + /** Leaf nodes that can survive analysis must define their own statistics. */ + def computeStats(): Statistics = throw new UnsupportedOperationException } /** @@ -331,23 +305,6 @@ abstract class UnaryNode extends LogicalPlan { } override protected def validConstraints: Set[Expression] = child.constraints - - override def computeStats: Statistics = { - // There should be some overhead in Row object, the size should not be zero when there is - // no columns, this help to prevent divide-by-zero error. - val childRowSize = child.output.map(_.dataType.defaultSize).sum + 8 - val outputRowSize = output.map(_.dataType.defaultSize).sum + 8 - // Assume there will be the same number of rows as child has. - var sizeInBytes = (child.stats.sizeInBytes * outputRowSize) / childRowSize - if (sizeInBytes == 0) { - // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero - // (product of children). - sizeInBytes = 1 - } - - // Don't propagate rowCount and attributeStats, since they are not estimated here. - Statistics(sizeInBytes = sizeInBytes, hints = child.stats.hints) - } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala new file mode 100644 index 0000000000000..b23045810a4f6 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +/** + * A visitor pattern for traversing a [[LogicalPlan]] tree and compute some properties. 
+ */ +trait LogicalPlanVisitor[T] { + + def visit(p: LogicalPlan): T = p match { + case p: Aggregate => visitAggregate(p) + case p: Distinct => visitDistinct(p) + case p: Except => visitExcept(p) + case p: Expand => visitExpand(p) + case p: Filter => visitFilter(p) + case p: Generate => visitGenerate(p) + case p: GlobalLimit => visitGlobalLimit(p) + case p: Intersect => visitIntersect(p) + case p: Join => visitJoin(p) + case p: LocalLimit => visitLocalLimit(p) + case p: Pivot => visitPivot(p) + case p: Project => visitProject(p) + case p: Range => visitRange(p) + case p: Repartition => visitRepartition(p) + case p: RepartitionByExpression => visitRepartitionByExpr(p) + case p: Sample => visitSample(p) + case p: ScriptTransformation => visitScriptTransform(p) + case p: Union => visitUnion(p) + case p: ResolvedHint => visitHint(p) + case p: LogicalPlan => default(p) + } + + def default(p: LogicalPlan): T + + def visitAggregate(p: Aggregate): T + + def visitDistinct(p: Distinct): T + + def visitExcept(p: Except): T + + def visitExpand(p: Expand): T + + def visitFilter(p: Filter): T + + def visitGenerate(p: Generate): T + + def visitGlobalLimit(p: GlobalLimit): T + + def visitHint(p: ResolvedHint): T + + def visitIntersect(p: Intersect): T + + def visitJoin(p: Join): T + + def visitLocalLimit(p: LocalLimit): T + + def visitPivot(p: Pivot): T + + def visitProject(p: Project): T + + def visitRange(p: Range): T + + def visitRepartition(p: Repartition): T + + def visitRepartitionByExpr(p: RepartitionByExpression): T + + def visitSample(p: Sample): T + + def visitScriptTransform(p: ScriptTransformation): T + + def visitUnion(p: Union): T +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index e89caabf252d7..0bd3166352d35 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -63,14 +63,6 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extend override def validConstraints: Set[Expression] = child.constraints.union(getAliasedConstraints(projectList)) - - override def computeStats: Statistics = { - if (conf.cboEnabled) { - ProjectEstimation.estimate(this).getOrElse(super.computeStats) - } else { - super.computeStats - } - } } /** @@ -137,14 +129,6 @@ case class Filter(condition: Expression, child: LogicalPlan) .filterNot(SubqueryExpression.hasCorrelatedSubquery) child.constraints.union(predicates.toSet) } - - override def computeStats: Statistics = { - if (conf.cboEnabled) { - FilterEstimation(this).estimate.getOrElse(super.computeStats) - } else { - super.computeStats - } - } } abstract class SetOperation(left: LogicalPlan, right: LogicalPlan) extends BinaryNode { @@ -190,15 +174,6 @@ case class Intersect(left: LogicalPlan, right: LogicalPlan) extends SetOperation Some(children.flatMap(_.maxRows).min) } } - - override def computeStats: Statistics = { - val leftSize = left.stats.sizeInBytes - val rightSize = right.stats.sizeInBytes - val sizeInBytes = if (leftSize < rightSize) leftSize else rightSize - Statistics( - sizeInBytes = sizeInBytes, - hints = left.stats.hints.resetForJoin()) - } } case class Except(left: LogicalPlan, right: LogicalPlan) extends SetOperation(left, right) { @@ -207,10 +182,6 @@ case class Except(left: LogicalPlan, 
right: LogicalPlan) extends SetOperation(le override def output: Seq[Attribute] = left.output override protected def validConstraints: Set[Expression] = leftConstraints - - override def computeStats: Statistics = { - left.stats.copy() - } } /** Factory for constructing new `Union` nodes. */ @@ -247,11 +218,6 @@ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { children.length > 1 && childrenResolved && allChildrenCompatible } - override def computeStats: Statistics = { - val sizeInBytes = children.map(_.stats.sizeInBytes).sum - Statistics(sizeInBytes = sizeInBytes) - } - /** * Maps the constraints containing a given (original) sequence of attributes to those with a * given (reference) sequence of attributes. Given the nature of union, we expect that the @@ -355,25 +321,6 @@ case class Join( case UsingJoin(_, _) => false case _ => resolvedExceptNatural } - - override def computeStats: Statistics = { - def simpleEstimation: Statistics = joinType match { - case LeftAnti | LeftSemi => - // LeftSemi and LeftAnti won't ever be bigger than left - left.stats - case _ => - // Make sure we don't propagate isBroadcastable in other joins, because - // they could explode the size. - val stats = super.computeStats - stats.copy(hints = stats.hints.resetForJoin()) - } - - if (conf.cboEnabled) { - JoinEstimation.estimate(this).getOrElse(simpleEstimation) - } else { - simpleEstimation - } - } } /** @@ -522,14 +469,13 @@ case class Range( override def newInstance(): Range = copy(output = output.map(_.newInstance())) - override def computeStats: Statistics = { - val sizeInBytes = LongType.defaultSize * numElements - Statistics( sizeInBytes = sizeInBytes ) - } - override def simpleString: String = { s"Range ($start, $end, step=$step, splits=$numSlices)" } + + override def computeStats(): Statistics = { + Statistics(sizeInBytes = LongType.defaultSize * numElements) + } } case class Aggregate( @@ -554,25 +500,6 @@ case class Aggregate( val nonAgg = aggregateExpressions.filter(_.find(_.isInstanceOf[AggregateExpression]).isEmpty) child.constraints.union(getAliasedConstraints(nonAgg)) } - - override def computeStats: Statistics = { - def simpleEstimation: Statistics = { - if (groupingExpressions.isEmpty) { - Statistics( - sizeInBytes = EstimationUtils.getOutputSize(output, outputRowCount = 1), - rowCount = Some(1), - hints = child.stats.hints) - } else { - super.computeStats - } - } - - if (conf.cboEnabled) { - AggregateEstimation.estimate(this).getOrElse(simpleEstimation) - } else { - simpleEstimation - } - } } case class Window( @@ -671,11 +598,6 @@ case class Expand( override def references: AttributeSet = AttributeSet(projections.flatten.flatMap(_.references)) - override def computeStats: Statistics = { - val sizeInBytes = super.computeStats.sizeInBytes * projections.length - Statistics(sizeInBytes = sizeInBytes) - } - // This operator can reuse attributes (for example making them null when doing a roll up) so // the constraints of the child may no longer be valid. 
override protected def validConstraints: Set[Expression] = Set.empty[Expression] @@ -742,16 +664,6 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryN case _ => None } } - override def computeStats: Statistics = { - val limit = limitExpr.eval().asInstanceOf[Int] - val childStats = child.stats - val rowCount: BigInt = childStats.rowCount.map(_.min(limit)).getOrElse(limit) - // Don't propagate column stats, because we don't know the distribution after a limit operation - Statistics( - sizeInBytes = EstimationUtils.getOutputSize(output, rowCount, childStats.attributeStats), - rowCount = Some(rowCount), - hints = childStats.hints) - } } case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNode { @@ -762,24 +674,6 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNo case _ => None } } - override def computeStats: Statistics = { - val limit = limitExpr.eval().asInstanceOf[Int] - val childStats = child.stats - if (limit == 0) { - // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero - // (product of children). - Statistics( - sizeInBytes = 1, - rowCount = Some(0), - hints = childStats.hints) - } else { - // The output row count of LocalLimit should be the sum of row counts from each partition. - // However, since the number of partitions is not available here, we just use statistics of - // the child. Because the distribution after a limit operation is unknown, we do not propagate - // the column stats. - childStats.copy(attributeStats = AttributeMap(Nil)) - } - } } /** @@ -828,18 +722,6 @@ case class Sample( } override def output: Seq[Attribute] = child.output - - override def computeStats: Statistics = { - val ratio = upperBound - lowerBound - val childStats = child.stats - var sizeInBytes = EstimationUtils.ceil(BigDecimal(childStats.sizeInBytes) * ratio) - if (sizeInBytes == 0) { - sizeInBytes = 1 - } - val sampledRowCount = childStats.rowCount.map(c => EstimationUtils.ceil(BigDecimal(c) * ratio)) - // Don't propagate column stats, because we don't know the distribution after a sample operation - Statistics(sizeInBytes, sampledRowCount, hints = childStats.hints) - } } /** @@ -893,7 +775,7 @@ case class RepartitionByExpression( case object OneRowRelation extends LeafNode { override def maxRows: Option[Long] = Some(1) override def output: Seq[Attribute] = Nil - override def computeStats: Statistics = Statistics(sizeInBytes = 1) + override def computeStats(): Statistics = Statistics(sizeInBytes = 1) } /** A logical plan for `dropDuplicates`. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index 8479c702d7561..29a43528124d8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -42,11 +42,6 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo()) override def output: Seq[Attribute] = child.output override lazy val canonicalized: LogicalPlan = child.canonicalized - - override def computeStats: Statistics = { - val stats = child.stats - stats.copy(hints = hints) - } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala new file mode 100644 index 0000000000000..93908b04fb643 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical.statsEstimation + +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.types.LongType + +/** + * An [[LogicalPlanVisitor]] that computes a the statistics used in a cost-based optimizer. + */ +object BasicStatsPlanVisitor extends LogicalPlanVisitor[Statistics] { + + /** Falls back to the estimation computed by [[SizeInBytesOnlyStatsPlanVisitor]]. 
*/ + private def fallback(p: LogicalPlan): Statistics = SizeInBytesOnlyStatsPlanVisitor.visit(p) + + override def default(p: LogicalPlan): Statistics = fallback(p) + + override def visitAggregate(p: Aggregate): Statistics = { + AggregateEstimation.estimate(p).getOrElse(fallback(p)) + } + + override def visitDistinct(p: Distinct): Statistics = fallback(p) + + override def visitExcept(p: Except): Statistics = fallback(p) + + override def visitExpand(p: Expand): Statistics = fallback(p) + + override def visitFilter(p: Filter): Statistics = { + FilterEstimation(p).estimate.getOrElse(fallback(p)) + } + + override def visitGenerate(p: Generate): Statistics = fallback(p) + + override def visitGlobalLimit(p: GlobalLimit): Statistics = fallback(p) + + override def visitHint(p: ResolvedHint): Statistics = fallback(p) + + override def visitIntersect(p: Intersect): Statistics = fallback(p) + + override def visitJoin(p: Join): Statistics = { + JoinEstimation.estimate(p).getOrElse(fallback(p)) + } + + override def visitLocalLimit(p: LocalLimit): Statistics = fallback(p) + + override def visitPivot(p: Pivot): Statistics = fallback(p) + + override def visitProject(p: Project): Statistics = { + ProjectEstimation.estimate(p).getOrElse(fallback(p)) + } + + override def visitRange(p: logical.Range): Statistics = { + val sizeInBytes = LongType.defaultSize * p.numElements + Statistics(sizeInBytes = sizeInBytes) + } + + override def visitRepartition(p: Repartition): Statistics = fallback(p) + + override def visitRepartitionByExpr(p: RepartitionByExpression): Statistics = fallback(p) + + override def visitSample(p: Sample): Statistics = fallback(p) + + override def visitScriptTransform(p: ScriptTransformation): Statistics = fallback(p) + + override def visitUnion(p: Union): Statistics = fallback(p) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/LogicalPlanStats.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/LogicalPlanStats.scala new file mode 100644 index 0000000000000..8660d93550192 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/LogicalPlanStats.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical.statsEstimation + +import org.apache.spark.sql.catalyst.plans.logical._ + +/** + * A trait to add statistics propagation to [[LogicalPlan]]. + */ +trait LogicalPlanStats { self: LogicalPlan => + + /** + * Returns the estimated statistics for the current logical plan node. Under the hood, this + * method caches the return value, which is computed based on the configuration passed in the + * first time. 
If the configuration changes, the cache can be invalidated by calling + * [[invalidateStatsCache()]]. + */ + def stats: Statistics = statsCache.getOrElse { + if (conf.cboEnabled) { + statsCache = Option(BasicStatsPlanVisitor.visit(self)) + } else { + statsCache = Option(SizeInBytesOnlyStatsPlanVisitor.visit(self)) + } + statsCache.get + } + + /** A cache for the estimated statistics, such that it will only be computed once. */ + protected var statsCache: Option[Statistics] = None + + /** Invalidates the stats cache. See [[stats]] for more information. */ + final def invalidateStatsCache(): Unit = { + statsCache = None + children.foreach(_.invalidateStatsCache()) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala new file mode 100644 index 0000000000000..559f12072e448 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical.statsEstimation + +import org.apache.spark.sql.catalyst.expressions.AttributeMap +import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi} +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical._ + +/** + * An [[LogicalPlanVisitor]] that computes a single dimension for plan stats: size in bytes. + */ +object SizeInBytesOnlyStatsPlanVisitor extends LogicalPlanVisitor[Statistics] { + + /** + * A default, commonly used estimation for unary nodes. We assume the input row number is the + * same as the output row number, and compute sizes based on the column types. + */ + private def visitUnaryNode(p: UnaryNode): Statistics = { + // There should be some overhead in Row object, the size should not be zero when there is + // no columns, this help to prevent divide-by-zero error. + val childRowSize = p.child.output.map(_.dataType.defaultSize).sum + 8 + val outputRowSize = p.output.map(_.dataType.defaultSize).sum + 8 + // Assume there will be the same number of rows as child has. + var sizeInBytes = (p.child.stats.sizeInBytes * outputRowSize) / childRowSize + if (sizeInBytes == 0) { + // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero + // (product of children). + sizeInBytes = 1 + } + + // Don't propagate rowCount and attributeStats, since they are not estimated here. + Statistics(sizeInBytes = sizeInBytes, hints = p.child.stats.hints) + } + + /** + * For leaf nodes, use its computeStats. 
For other nodes, we assume the size in bytes is the + * sum of all of the children's. + */ + override def default(p: LogicalPlan): Statistics = p match { + case p: LeafNode => p.computeStats() + case _: LogicalPlan => Statistics(sizeInBytes = p.children.map(_.stats.sizeInBytes).product) + } + + override def visitAggregate(p: Aggregate): Statistics = { + if (p.groupingExpressions.isEmpty) { + Statistics( + sizeInBytes = EstimationUtils.getOutputSize(p.output, outputRowCount = 1), + rowCount = Some(1), + hints = p.child.stats.hints) + } else { + visitUnaryNode(p) + } + } + + override def visitDistinct(p: Distinct): Statistics = default(p) + + override def visitExcept(p: Except): Statistics = p.left.stats.copy() + + override def visitExpand(p: Expand): Statistics = { + val sizeInBytes = visitUnaryNode(p).sizeInBytes * p.projections.length + Statistics(sizeInBytes = sizeInBytes) + } + + override def visitFilter(p: Filter): Statistics = visitUnaryNode(p) + + override def visitGenerate(p: Generate): Statistics = default(p) + + override def visitGlobalLimit(p: GlobalLimit): Statistics = { + val limit = p.limitExpr.eval().asInstanceOf[Int] + val childStats = p.child.stats + val rowCount: BigInt = childStats.rowCount.map(_.min(limit)).getOrElse(limit) + // Don't propagate column stats, because we don't know the distribution after limit + Statistics( + sizeInBytes = EstimationUtils.getOutputSize(p.output, rowCount, childStats.attributeStats), + rowCount = Some(rowCount), + hints = childStats.hints) + } + + override def visitHint(p: ResolvedHint): Statistics = p.child.stats.copy(hints = p.hints) + + override def visitIntersect(p: Intersect): Statistics = { + val leftSize = p.left.stats.sizeInBytes + val rightSize = p.right.stats.sizeInBytes + val sizeInBytes = if (leftSize < rightSize) leftSize else rightSize + Statistics( + sizeInBytes = sizeInBytes, + hints = p.left.stats.hints.resetForJoin()) + } + + override def visitJoin(p: Join): Statistics = { + p.joinType match { + case LeftAnti | LeftSemi => + // LeftSemi and LeftAnti won't ever be bigger than left + p.left.stats + case _ => + // Make sure we don't propagate isBroadcastable in other joins, because + // they could explode the size. + val stats = default(p) + stats.copy(hints = stats.hints.resetForJoin()) + } + } + + override def visitLocalLimit(p: LocalLimit): Statistics = { + val limit = p.limitExpr.eval().asInstanceOf[Int] + val childStats = p.child.stats + if (limit == 0) { + // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero + // (product of children). + Statistics(sizeInBytes = 1, rowCount = Some(0), hints = childStats.hints) + } else { + // The output row count of LocalLimit should be the sum of row counts from each partition. + // However, since the number of partitions is not available here, we just use statistics of + // the child. Because the distribution after a limit operation is unknown, we do not propagate + // the column stats. 
+ childStats.copy(attributeStats = AttributeMap(Nil)) + } + } + + override def visitPivot(p: Pivot): Statistics = default(p) + + override def visitProject(p: Project): Statistics = visitUnaryNode(p) + + override def visitRange(p: logical.Range): Statistics = { + p.computeStats() + } + + override def visitRepartition(p: Repartition): Statistics = default(p) + + override def visitRepartitionByExpr(p: RepartitionByExpression): Statistics = default(p) + + override def visitSample(p: Sample): Statistics = { + val ratio = p.upperBound - p.lowerBound + var sizeInBytes = EstimationUtils.ceil(BigDecimal(p.child.stats.sizeInBytes) * ratio) + if (sizeInBytes == 0) { + sizeInBytes = 1 + } + val sampleRows = p.child.stats.rowCount.map(c => EstimationUtils.ceil(BigDecimal(c) * ratio)) + // Don't propagate column stats, because we don't know the distribution after a sample operation + Statistics(sizeInBytes, sampleRows, hints = p.child.stats.hints) + } + + override def visitScriptTransform(p: ScriptTransformation): Statistics = default(p) + + override def visitUnion(p: Union): Statistics = { + Statistics(sizeInBytes = p.children.map(_.stats.sizeInBytes).sum) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index 912c5fed63450..31a8cbdee9777 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -77,37 +77,6 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { checkStats(globalLimit, stats) } - test("sample estimation") { - val sample = Sample(0.0, 0.5, withReplacement = false, (math.random * 1000).toLong, plan) - checkStats(sample, Statistics(sizeInBytes = 60, rowCount = Some(5))) - - // Child doesn't have rowCount in stats - val childStats = Statistics(sizeInBytes = 120) - val childPlan = DummyLogicalPlan(childStats, childStats) - val sample2 = - Sample(0.0, 0.11, withReplacement = false, (math.random * 1000).toLong, childPlan) - checkStats(sample2, Statistics(sizeInBytes = 14)) - } - - test("estimate statistics when the conf changes") { - val expectedDefaultStats = - Statistics( - sizeInBytes = 40, - rowCount = Some(10), - attributeStats = AttributeMap(Seq( - AttributeReference("c1", IntegerType)() -> ColumnStat(10, Some(1), Some(10), 0, 4, 4)))) - val expectedCboStats = - Statistics( - sizeInBytes = 4, - rowCount = Some(1), - attributeStats = AttributeMap(Seq( - AttributeReference("c1", IntegerType)() -> ColumnStat(1, Some(5), Some(5), 0, 4, 4)))) - - val plan = DummyLogicalPlan(defaultStats = expectedDefaultStats, cboStats = expectedCboStats) - checkStats( - plan, expectedStatsCboOn = expectedCboStats, expectedStatsCboOff = expectedDefaultStats) - } - /** Check estimated stats when cbo is turned on/off. */ private def checkStats( plan: LogicalPlan, @@ -132,16 +101,3 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { private def checkStats(plan: LogicalPlan, expectedStats: Statistics): Unit = checkStats(plan, expectedStats, expectedStats) } - -/** - * This class is used for unit-testing the cbo switch, it mimics a logical plan which computes - * a simple statistics or a cbo estimated statistics based on the conf. 
- */ -private case class DummyLogicalPlan( - defaultStats: Statistics, - cboStats: Statistics) extends LogicalPlan { - override def output: Seq[Attribute] = Nil - override def children: Seq[LogicalPlan] = Nil - override def computeStats: Statistics = - if (conf.cboEnabled) cboStats else defaultStats -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala index eaa33e44a6a5a..31dea2e3e7f1d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala @@ -65,7 +65,7 @@ case class StatsTestPlan( attributeStats: AttributeMap[ColumnStat], size: Option[BigInt] = None) extends LeafNode { override def output: Seq[Attribute] = outputList - override def computeStats: Statistics = Statistics( + override def computeStats(): Statistics = Statistics( // If sizeInBytes is useless in testing, we just use a fake value sizeInBytes = size.getOrElse(Int.MaxValue), rowCount = Some(rowCount), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 66f66a289a065..dcb918eeb9d10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -88,7 +88,7 @@ case class ExternalRDD[T]( override protected def stringArgs: Iterator[Any] = Iterator(output) - @transient override def computeStats: Statistics = Statistics( + override def computeStats(): Statistics = Statistics( // TODO: Instead of returning a default value here, find a way to return a meaningful size // estimate for RDDs. See PR 1238 for more discussions. sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) @@ -156,7 +156,7 @@ case class LogicalRDD( override protected def stringArgs: Iterator[Any] = Iterator(output) - @transient override def computeStats: Statistics = Statistics( + override def computeStats(): Statistics = Statistics( // TODO: Instead of returning a default value here, find a way to return a meaningful size // estimate for RDDs. See PR 1238 for more discussions. sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index 2972132336de0..39cf8fcac5116 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -69,7 +69,7 @@ case class InMemoryRelation( @transient val partitionStatistics = new PartitionStatistics(output) - override def computeStats: Statistics = { + override def computeStats(): Statistics = { if (batchStats.value == 0L) { // Underlying columnar RDD hasn't been materialized, no useful statistics information // available, return the default statistics. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 6ba190b9e5dcf..699f1bad9c4ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -48,9 +48,10 @@ case class LogicalRelation( output = output.map(QueryPlan.normalizeExprId(_, output)), catalogTable = None) - @transient override def computeStats: Statistics = { - catalogTable.flatMap(_.stats.map(_.toPlanStats(output))).getOrElse( - Statistics(sizeInBytes = relation.sizeInBytes)) + override def computeStats(): Statistics = { + catalogTable + .flatMap(_.stats.map(_.toPlanStats(output))) + .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes)) } /** Used to lookup original attribute capitalization */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 4979873ee3c7f..587ae2bfb63fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -230,6 +230,5 @@ case class MemoryPlan(sink: MemorySink, output: Seq[Attribute]) extends LeafNode private val sizePerRow = sink.schema.toAttributes.map(_.dataType.defaultSize).sum - override def computeStats: Statistics = - Statistics(sizePerRow * sink.allData.size) + override def computeStats(): Statistics = Statistics(sizePerRow * sink.allData.size) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala index 3a724aa14f2a9..94384185d190a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -86,7 +86,7 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te case relation: LogicalRelation => relation } assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}") - val size2 = relations(0).computeStats.sizeInBytes + val size2 = relations(0).stats.sizeInBytes assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes) assert(size2 < tableStats.get.sizeInBytes) } From 37ef32e515ea071afe63b56ba0d4299bb76e8a75 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 1 Jul 2017 14:57:57 +0800 Subject: [PATCH 063/779] [SPARK-21275][ML] Update GLM test to use supportedFamilyNames ## What changes were proposed in this pull request? Update GLM test to use supportedFamilyNames as suggested here: https://github.com/apache/spark/pull/16699#discussion-diff-100574976R855 Author: actuaryzhang Closes #18495 from actuaryzhang/mlGlmTest2. 
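A minimal sketch of the ordering assumption behind this change: the test now iterates over the estimator's own `supportedFamilyNames`, sorted alphabetically, so the R-generated expected values only need to be listed in that same order. The concrete family names below are assumed for illustration, not read from Spark.

```scala
// Sketch only: the family list here is assumed; the real one comes from
// GeneralizedLinearRegression.supportedFamilyNames.
val supportedFamilyNames = Array("gaussian", "binomial", "poisson", "gamma", "tweedie")

// Sorting gives the stable, alphabetical order that the expected coefficients must follow.
val familiesInTestOrder = supportedFamilyNames.sortWith(_ < _)
// Array(binomial, gamma, gaussian, poisson, tweedie)

familiesInTestOrder.zipWithIndex.foreach { case (family, i) =>
  println(s"expected($i) corresponds to family '$family'")
}
```
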
--- .../GeneralizedLinearRegressionSuite.scala | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 83f1344a7bcb1..a47bd17f47bb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -749,15 +749,15 @@ class GeneralizedLinearRegressionSuite library(statmod) y <- c(1.0, 0.5, 0.7, 0.3) w <- c(1, 2, 3, 4) - for (fam in list(gaussian(), poisson(), binomial(), Gamma(), tweedie(1.6))) { + for (fam in list(binomial(), Gamma(), gaussian(), poisson(), tweedie(1.6))) { model1 <- glm(y ~ 1, family = fam) model2 <- glm(y ~ 1, family = fam, weights = w) print(as.vector(c(coef(model1), coef(model2)))) } - [1] 0.625 0.530 - [1] -0.4700036 -0.6348783 [1] 0.5108256 0.1201443 [1] 1.600000 1.886792 + [1] 0.625 0.530 + [1] -0.4700036 -0.6348783 [1] 1.325782 1.463641 */ @@ -768,13 +768,13 @@ class GeneralizedLinearRegressionSuite Instance(0.3, 4.0, Vectors.zeros(0)) ).toDF() - val expected = Seq(0.625, 0.530, -0.4700036, -0.6348783, 0.5108256, 0.1201443, - 1.600000, 1.886792, 1.325782, 1.463641) + val expected = Seq(0.5108256, 0.1201443, 1.600000, 1.886792, 0.625, 0.530, + -0.4700036, -0.6348783, 1.325782, 1.463641) import GeneralizedLinearRegression._ var idx = 0 - for (family <- Seq("gaussian", "poisson", "binomial", "gamma", "tweedie")) { + for (family <- GeneralizedLinearRegression.supportedFamilyNames.sortWith(_ < _)) { for (useWeight <- Seq(false, true)) { val trainer = new GeneralizedLinearRegression().setFamily(family) if (useWeight) trainer.setWeightCol("weight") @@ -807,7 +807,7 @@ class GeneralizedLinearRegressionSuite 0.5, 2.1, 0.5, 1.0, 2.0, 0.9, 0.4, 1.0, 2.0, 1.0, 0.7, 0.7, 0.0, 3.0, 3.0), 4, 5, byrow = TRUE)) - families <- list(gaussian, binomial, poisson, Gamma, tweedie(1.5)) + families <- list(binomial, Gamma, gaussian, poisson, tweedie(1.5)) f1 <- V1 ~ -1 + V4 + V5 f2 <- V1 ~ V4 + V5 for (f in c(f1, f2)) { @@ -816,15 +816,15 @@ class GeneralizedLinearRegressionSuite print(as.vector(coef(model))) } } - [1] 0.5169222 -0.3344444 [1] 0.9419107 -0.6864404 - [1] 0.1812436 -0.6568422 [1] -0.2869094 0.7857710 + [1] 0.5169222 -0.3344444 + [1] 0.1812436 -0.6568422 [1] 0.1055254 0.2979113 - [1] -0.05990345 0.53188982 -0.32118415 [1] -0.2147117 0.9911750 -0.6356096 - [1] -1.5616130 0.6646470 -0.3192581 [1] 0.3390397 -0.3406099 0.6870259 + [1] -0.05990345 0.53188982 -0.32118415 + [1] -1.5616130 0.6646470 -0.3192581 [1] 0.3665034 0.1039416 0.1484616 */ val dataset = Seq( @@ -835,23 +835,22 @@ class GeneralizedLinearRegressionSuite ).toDF() val expected = Seq( - Vectors.dense(0, 0.5169222, -0.3344444), Vectors.dense(0, 0.9419107, -0.6864404), - Vectors.dense(0, 0.1812436, -0.6568422), Vectors.dense(0, -0.2869094, 0.785771), + Vectors.dense(0, 0.5169222, -0.3344444), + Vectors.dense(0, 0.1812436, -0.6568422), Vectors.dense(0, 0.1055254, 0.2979113), - Vectors.dense(-0.05990345, 0.53188982, -0.32118415), Vectors.dense(-0.2147117, 0.991175, -0.6356096), - Vectors.dense(-1.561613, 0.664647, -0.3192581), Vectors.dense(0.3390397, -0.3406099, 0.6870259), + Vectors.dense(-0.05990345, 0.53188982, -0.32118415), + Vectors.dense(-1.561613, 0.664647, -0.3192581), Vectors.dense(0.3665034, 0.1039416, 0.1484616)) import GeneralizedLinearRegression._ var 
idx = 0 - for (fitIntercept <- Seq(false, true)) { - for (family <- Seq("gaussian", "binomial", "poisson", "gamma", "tweedie")) { + for (family <- GeneralizedLinearRegression.supportedFamilyNames.sortWith(_ < _)) { val trainer = new GeneralizedLinearRegression().setFamily(family) .setFitIntercept(fitIntercept).setOffsetCol("offset") .setWeightCol("weight").setLinkPredictionCol("linkPrediction") From e0b047eafed92eadf6842a9df964438095e12d41 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Sat, 1 Jul 2017 15:37:41 +0800 Subject: [PATCH 064/779] [SPARK-18518][ML] HasSolver supports override ## What changes were proposed in this pull request? 1, make param support non-final with `finalFields` option 2, generate `HasSolver` with `finalFields = false` 3, override `solver` in LiR, GLR, and make MLPC inherit `HasSolver` ## How was this patch tested? existing tests Author: Ruifeng Zheng Author: Zheng RuiFeng Closes #16028 from zhengruifeng/param_non_final. --- .../MultilayerPerceptronClassifier.scala | 19 ++++---- .../ml/param/shared/SharedParamsCodeGen.scala | 11 +++-- .../spark/ml/param/shared/sharedParams.scala | 8 ++-- .../GeneralizedLinearRegression.scala | 21 ++++++++- .../ml/regression/LinearRegression.scala | 46 +++++++++++++++---- python/pyspark/ml/classification.py | 18 +------- python/pyspark/ml/regression.py | 5 ++ 7 files changed, 82 insertions(+), 46 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index ec39f964e213a..ceba11edc93be 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -27,13 +27,16 @@ import org.apache.spark.ml.ann.{FeedForwardTopology, FeedForwardTrainer} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed, HasStepSize, HasTol} +import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.Dataset /** Params for Multilayer Perceptron. */ private[classification] trait MultilayerPerceptronParams extends PredictorParams - with HasSeed with HasMaxIter with HasTol with HasStepSize { + with HasSeed with HasMaxIter with HasTol with HasStepSize with HasSolver { + + import MultilayerPerceptronClassifier._ + /** * Layer sizes including input size and output size. * @@ -78,14 +81,10 @@ private[classification] trait MultilayerPerceptronParams extends PredictorParams * @group expertParam */ @Since("2.0.0") - final val solver: Param[String] = new Param[String](this, "solver", + final override val solver: Param[String] = new Param[String](this, "solver", "The solver algorithm for optimization. Supported options: " + - s"${MultilayerPerceptronClassifier.supportedSolvers.mkString(", ")}. (Default l-bfgs)", - ParamValidators.inArray[String](MultilayerPerceptronClassifier.supportedSolvers)) - - /** @group expertGetParam */ - @Since("2.0.0") - final def getSolver: String = $(solver) + s"${supportedSolvers.mkString(", ")}. (Default l-bfgs)", + ParamValidators.inArray[String](supportedSolvers)) /** * The initial weights of the model. 
@@ -101,7 +100,7 @@ private[classification] trait MultilayerPerceptronParams extends PredictorParams final def getInitialWeights: Vector = $(initialWeights) setDefault(maxIter -> 100, tol -> 1e-6, blockSize -> 128, - solver -> MultilayerPerceptronClassifier.LBFGS, stepSize -> 0.03) + solver -> LBFGS, stepSize -> 0.03) } /** Label to vector converter. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 013817a41baf5..23e0d45d943a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -80,8 +80,7 @@ private[shared] object SharedParamsCodeGen { " 0)", isValid = "ParamValidators.gt(0)"), ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0"), - ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + - "empty, default value is 'auto'", Some("\"auto\"")), + ParamDesc[String]("solver", "the solver algorithm for optimization", finalFields = false), ParamDesc[Int]("aggregationDepth", "suggested depth for treeAggregate (>= 2)", Some("2"), isValid = "ParamValidators.gtEq(2)", isExpertParam = true)) @@ -99,6 +98,7 @@ private[shared] object SharedParamsCodeGen { defaultValueStr: Option[String] = None, isValid: String = "", finalMethods: Boolean = true, + finalFields: Boolean = true, isExpertParam: Boolean = false) { require(name.matches("[a-z][a-zA-Z0-9]*"), s"Param name $name is invalid.") @@ -167,6 +167,11 @@ private[shared] object SharedParamsCodeGen { } else { "def" } + val fieldStr = if (param.finalFields) { + "final val" + } else { + "val" + } val htmlCompliantDoc = Utility.escape(doc) @@ -180,7 +185,7 @@ private[shared] object SharedParamsCodeGen { | * Param for $htmlCompliantDoc. | * @group ${groupStr(0)} | */ - | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) + | $fieldStr $name: $Param = new $Param(this, "$name", "$doc"$isValid) |$setDefault | /** @group ${groupStr(1)} */ | $methodStr get$Name: $T = $$($name) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 50619607a5054..1a8f499798b80 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -374,17 +374,15 @@ private[ml] trait HasWeightCol extends Params { } /** - * Trait for shared param solver (default: "auto"). + * Trait for shared param solver. */ private[ml] trait HasSolver extends Params { /** - * Param for the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. + * Param for the solver algorithm for optimization. * @group param */ - final val solver: Param[String] = new Param[String](this, "solver", "the solver algorithm for optimization. 
If this is not set or empty, default value is 'auto'") - - setDefault(solver, "auto") + val solver: Param[String] = new Param[String](this, "solver", "the solver algorithm for optimization") /** @group getParam */ final def getSolver: String = $(solver) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index ce3460ae43566..c600b87bdc64a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -164,7 +164,18 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam isDefined(linkPredictionCol) && $(linkPredictionCol).nonEmpty } - import GeneralizedLinearRegression._ + /** + * The solver algorithm for optimization. + * Supported options: "irls" (iteratively reweighted least squares). + * Default: "irls" + * + * @group param + */ + @Since("2.3.0") + final override val solver: Param[String] = new Param[String](this, "solver", + "The solver algorithm for optimization. Supported options: " + + s"${supportedSolvers.mkString(", ")}. (Default irls)", + ParamValidators.inArray[String](supportedSolvers)) @Since("2.0.0") override def validateAndTransformSchema( @@ -350,7 +361,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setSolver(value: String): this.type = set(solver, value) - setDefault(solver -> "irls") + setDefault(solver -> IRLS) /** * Sets the link prediction (linear predictor) column name. @@ -442,6 +453,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine Gamma -> Inverse, Gamma -> Identity, Gamma -> Log ) + /** String name for "irls" (iteratively reweighted least squares) solver. */ + private[regression] val IRLS = "irls" + + /** Set of solvers that GeneralizedLinearRegression supports. */ + private[regression] val supportedSolvers = Array(IRLS) + /** Set of family names that GeneralizedLinearRegression supports. */ private[regression] lazy val supportedFamilyNames = supportedFamilyAndLinkPairs.map(_._1.name).toArray :+ "tweedie" diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index db5ac4f14bd3b..ce5e0797915df 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -34,7 +34,7 @@ import org.apache.spark.ml.optim.WeightedLeastSquares import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.optim.aggregator.LeastSquaresAggregator import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction} -import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.evaluation.RegressionMetrics @@ -53,7 +53,23 @@ import org.apache.spark.storage.StorageLevel private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver - with HasAggregationDepth + with HasAggregationDepth { + + import LinearRegression._ + + /** + * The solver algorithm for optimization. 
+ * Supported options: "l-bfgs", "normal" and "auto". + * Default: "auto" + * + * @group param + */ + @Since("2.3.0") + final override val solver: Param[String] = new Param[String](this, "solver", + "The solver algorithm for optimization. Supported options: " + + s"${supportedSolvers.mkString(", ")}. (Default auto)", + ParamValidators.inArray[String](supportedSolvers)) +} /** * Linear regression. @@ -78,6 +94,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String extends Regressor[Vector, LinearRegression, LinearRegressionModel] with LinearRegressionParams with DefaultParamsWritable with Logging { + import LinearRegression._ + @Since("1.4.0") def this() = this(Identifiable.randomUID("linReg")) @@ -175,12 +193,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String * @group setParam */ @Since("1.6.0") - def setSolver(value: String): this.type = { - require(Set("auto", "l-bfgs", "normal").contains(value), - s"Solver $value was not supported. Supported options: auto, l-bfgs, normal") - set(solver, value) - } - setDefault(solver -> "auto") + def setSolver(value: String): this.type = set(solver, value) + setDefault(solver -> AUTO) /** * Suggested depth for treeAggregate (greater than or equal to 2). @@ -210,8 +224,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String elasticNetParam, fitIntercept, maxIter, regParam, standardization, aggregationDepth) instr.logNumFeatures(numFeatures) - if (($(solver) == "auto" && - numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == "normal") { + if (($(solver) == AUTO && + numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == NORMAL) { // For low dimensional data, WeightedLeastSquares is more efficient since the // training algorithm only requires one pass through the data. (SPARK-10668) @@ -444,6 +458,18 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] { */ @Since("2.1.0") val MAX_FEATURES_FOR_NORMAL_SOLVER: Int = WeightedLeastSquares.MAX_NUM_FEATURES + + /** String name for "auto". */ + private[regression] val AUTO = "auto" + + /** String name for "normal". */ + private[regression] val NORMAL = "normal" + + /** String name for "l-bfgs". */ + private[regression] val LBFGS = "l-bfgs" + + /** Set of solvers that LinearRegression supports. */ + private[regression] val supportedSolvers = Array(AUTO, NORMAL, LBFGS) } /** diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 9b345ac73f3d9..948806a5c936c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1265,8 +1265,8 @@ def theta(self): @inherit_doc class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasMaxIter, HasTol, HasSeed, HasStepSize, JavaMLWritable, - JavaMLReadable): + HasMaxIter, HasTol, HasSeed, HasStepSize, HasSolver, + JavaMLWritable, JavaMLReadable): """ Classifier trainer based on the Multilayer Perceptron. Each layer has sigmoid activation function, output layer has softmax. @@ -1407,20 +1407,6 @@ def getStepSize(self): """ return self.getOrDefault(self.stepSize) - @since("2.0.0") - def setSolver(self, value): - """ - Sets the value of :py:attr:`solver`. - """ - return self._set(solver=value) - - @since("2.0.0") - def getSolver(self): - """ - Gets the value of solver or its default value. 
- """ - return self.getOrDefault(self.solver) - @since("2.0.0") def setInitialWeights(self, value): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 2d17f95b0c44f..84d843369e105 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -95,6 +95,9 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction .. versionadded:: 1.4.0 """ + solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + + "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, @@ -1371,6 +1374,8 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha linkPower = Param(Params._dummy(), "linkPower", "The index in the power link function. " + "Only applicable to the Tweedie family.", typeConverter=TypeConverters.toFloat) + solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + + "options: irls.", typeConverter=TypeConverters.toString) @keyword_only def __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", From 6beca9ce94f484de2f9ffb946bef8334781b3122 Mon Sep 17 00:00:00 2001 From: Devaraj K Date: Sat, 1 Jul 2017 15:53:49 +0100 Subject: [PATCH 065/779] [SPARK-21170][CORE] Utils.tryWithSafeFinallyAndFailureCallbacks throws IllegalArgumentException: Self-suppression not permitted ## What changes were proposed in this pull request? Not adding the exception to the suppressed if it is the same instance as originalThrowable. ## How was this patch tested? Added new tests to verify this, these tests fail without source code changes and passes with the change. Author: Devaraj K Closes #18384 from devaraj-kavali/SPARK-21170. 
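A minimal sketch of the failure mode this commit fixes, not taken from the patch itself: `Throwable.addSuppressed` rejects the throwable's own instance with `IllegalArgumentException: Self-suppression not permitted`, so the fix guards the call with a reference check before suppressing. The object and value names below are illustrative only.

```scala
// Sketch only: demonstrates why suppressing the same Throwable instance fails
// and how a reference check avoids it.
object SelfSuppressionSketch {
  def main(args: Array[String]): Unit = {
    val original = new Error("original failure")

    // Without a guard: addSuppressed(this) throws
    // "IllegalArgumentException: Self-suppression not permitted".
    try {
      original.addSuppressed(original)
    } catch {
      case e: IllegalArgumentException => println(s"unguarded: $e")
    }

    // With the guard: only add the suppressed exception if it is a different instance.
    val thrownInFinally: Throwable = original // e.g. the same error rethrown by a callback
    if (original ne thrownInFinally) {
      original.addSuppressed(thrownInFinally)
    }
    println(s"guarded, suppressed count = ${original.getSuppressed.length}") // prints 0
  }
}
```
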
--- .../scala/org/apache/spark/util/Utils.scala | 30 +++---- .../org/apache/spark/util/UtilsSuite.scala | 88 ++++++++++++++++++- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index bbb7999e2a144..26f61e25da4d3 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1348,14 +1348,10 @@ private[spark] object Utils extends Logging { try { finallyBlock } catch { - case t: Throwable => - if (originalThrowable != null) { - originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in finally: " + t.getMessage, t) - throw originalThrowable - } else { - throw t - } + case t: Throwable if (originalThrowable != null && originalThrowable != t) => + originalThrowable.addSuppressed(t) + logWarning(s"Suppressing exception in finally: ${t.getMessage}", t) + throw originalThrowable } } } @@ -1387,22 +1383,20 @@ private[spark] object Utils extends Logging { catchBlock } catch { case t: Throwable => - originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in catch: " + t.getMessage, t) + if (originalThrowable != t) { + originalThrowable.addSuppressed(t) + logWarning(s"Suppressing exception in catch: ${t.getMessage}", t) + } } throw originalThrowable } finally { try { finallyBlock } catch { - case t: Throwable => - if (originalThrowable != null) { - originalThrowable.addSuppressed(t) - logWarning(s"Suppressing exception in finally: " + t.getMessage, t) - throw originalThrowable - } else { - throw t - } + case t: Throwable if (originalThrowable != null && originalThrowable != t) => + originalThrowable.addSuppressed(t) + logWarning(s"Suppressing exception in finally: ${t.getMessage}", t) + throw originalThrowable } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f7bc8f888b0d5..4ce143f18bbf1 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -38,7 +38,7 @@ import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.network.util.ByteUnit @@ -1024,4 +1024,90 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(redactedConf("spark.sensitive.property") === Utils.REDACTION_REPLACEMENT_TEXT) } + + test("tryWithSafeFinally") { + var e = new Error("Block0") + val finallyBlockError = new Error("Finally Block") + var isErrorOccurred = false + // if the try and finally blocks throw different exception instances + try { + Utils.tryWithSafeFinally { throw e }(finallyBlock = { throw finallyBlockError }) + } catch { + case t: Error => + assert(t.getSuppressed.head == finallyBlockError) + isErrorOccurred = true + } + assert(isErrorOccurred) + // if the try and finally blocks throw the same exception instance then it should not + // try to add to suppressed and get IllegalArgumentException + e = new Error("Block1") + isErrorOccurred = false + try { + Utils.tryWithSafeFinally { throw e }(finallyBlock = { throw e }) + } catch { + case t: Error => + assert(t.getSuppressed.length == 0) + isErrorOccurred = true + } + 
assert(isErrorOccurred) + // if the try throws the exception and finally doesn't throw exception + e = new Error("Block2") + isErrorOccurred = false + try { + Utils.tryWithSafeFinally { throw e }(finallyBlock = {}) + } catch { + case t: Error => + assert(t.getSuppressed.length == 0) + isErrorOccurred = true + } + assert(isErrorOccurred) + // if the try and finally block don't throw exception + Utils.tryWithSafeFinally {}(finallyBlock = {}) + } + + test("tryWithSafeFinallyAndFailureCallbacks") { + var e = new Error("Block0") + val catchBlockError = new Error("Catch Block") + val finallyBlockError = new Error("Finally Block") + var isErrorOccurred = false + TaskContext.setTaskContext(TaskContext.empty()) + // if the try, catch and finally blocks throw different exception instances + try { + Utils.tryWithSafeFinallyAndFailureCallbacks { throw e }( + catchBlock = { throw catchBlockError }, finallyBlock = { throw finallyBlockError }) + } catch { + case t: Error => + assert(t.getSuppressed.head == catchBlockError) + assert(t.getSuppressed.last == finallyBlockError) + isErrorOccurred = true + } + assert(isErrorOccurred) + // if the try, catch and finally blocks throw the same exception instance then it should not + // try to add to suppressed and get IllegalArgumentException + e = new Error("Block1") + isErrorOccurred = false + try { + Utils.tryWithSafeFinallyAndFailureCallbacks { throw e }(catchBlock = { throw e }, + finallyBlock = { throw e }) + } catch { + case t: Error => + assert(t.getSuppressed.length == 0) + isErrorOccurred = true + } + assert(isErrorOccurred) + // if the try throws the exception, catch and finally don't throw exceptions + e = new Error("Block2") + isErrorOccurred = false + try { + Utils.tryWithSafeFinallyAndFailureCallbacks { throw e }(catchBlock = {}, finallyBlock = {}) + } catch { + case t: Error => + assert(t.getSuppressed.length == 0) + isErrorOccurred = true + } + assert(isErrorOccurred) + // if the try, catch and finally blocks don't throw exceptions + Utils.tryWithSafeFinallyAndFailureCallbacks {}(catchBlock = {}, finallyBlock = {}) + TaskContext.unset + } } From c605fee01f180588ecb2f48710a7b84073bd3b9a Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Sun, 2 Jul 2017 08:50:48 +0100 Subject: [PATCH 066/779] [SPARK-21260][SQL][MINOR] Remove the unused OutputFakerExec ## What changes were proposed in this pull request? OutputFakerExec was added long ago and is not used anywhere now so we should remove it. ## How was this patch tested? N/A Author: Xingbo Jiang Closes #18473 from jiangxb1987/OutputFakerExec. --- .../spark/sql/execution/basicPhysicalOperators.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index f3ca8397047fe..2151c339b9b87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -584,17 +584,6 @@ case class CoalesceExec(numPartitions: Int, child: SparkPlan) extends UnaryExecN } } -/** - * A plan node that does nothing but lie about the output of its child. Used to spice a - * (hopefully structurally equivalent) tree from a different optimization sequence into an already - * resolved tree. 
- */ -case class OutputFakerExec(output: Seq[Attribute], child: SparkPlan) extends SparkPlan { - def children: Seq[SparkPlan] = child :: Nil - - protected override def doExecute(): RDD[InternalRow] = child.execute() -} - /** * Physical plan for a subquery. */ From c19680be1c532dded1e70edce7a981ba28af09ad Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sun, 2 Jul 2017 16:17:03 +0800 Subject: [PATCH 067/779] [SPARK-19852][PYSPARK][ML] Python StringIndexer supports 'keep' to handle invalid data ## What changes were proposed in this pull request? This PR is to maintain API parity with changes made in SPARK-17498 to support a new option 'keep' in StringIndexer to handle unseen labels or NULL values with PySpark. Note: This is updated version of #17237 , the primary author of this PR is VinceShieh . ## How was this patch tested? Unit tests. Author: VinceShieh Author: Yanbo Liang Closes #18453 from yanboliang/spark-19852. --- python/pyspark/ml/feature.py | 6 ++++++ python/pyspark/ml/tests.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 77de1cc18246d..25ad06f682ed9 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2132,6 +2132,12 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, "frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.", typeConverter=TypeConverters.toString) + handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid data (unseen " + + "labels or NULL values). Options are 'skip' (filter out rows with " + + "invalid data), error (throw an error), or 'keep' (put invalid data " + + "in a special additional bucket, at index numLabels).", + typeConverter=TypeConverters.toString) + @keyword_only def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", stringOrderType="frequencyDesc"): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 17a39472e1fe5..ffb8b0a890ff8 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -551,6 +551,27 @@ def test_rformula_string_indexer_order_type(self): for i in range(0, len(expected)): self.assertTrue(all(observed[i]["features"].toArray() == expected[i])) + def test_string_indexer_handle_invalid(self): + df = self.spark.createDataFrame([ + (0, "a"), + (1, "d"), + (2, None)], ["id", "label"]) + + si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep", + stringOrderType="alphabetAsc") + model1 = si1.fit(df) + td1 = model1.transform(df) + actual1 = td1.select("id", "indexed").collect() + expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)] + self.assertEqual(actual1, expected1) + + si2 = si1.setHandleInvalid("skip") + model2 = si2.fit(df) + td2 = model2.transform(df) + actual2 = td2.select("id", "indexed").collect() + expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)] + self.assertEqual(actual2, expected2) + class HasInducedError(Params): From d4107196d59638845bd19da6aab074424d90ddaf Mon Sep 17 00:00:00 2001 From: Rui Zha Date: Sun, 2 Jul 2017 17:37:47 -0700 Subject: [PATCH 068/779] [SPARK-18004][SQL] Make sure the date or timestamp related predicate can be pushed down to Oracle correctly ## What changes were proposed in this pull request? Move `compileValue` method in JDBCRDD to JdbcDialect, and override the `compileValue` method in OracleDialect to rewrite the Oracle-specific timestamp and date literals in where clause. ## How was this patch tested? 
An integration test has been added. Author: Rui Zha Author: Zharui Closes #18451 from SharpRay/extend-compileValue-to-dialects. --- .../sql/jdbc/OracleIntegrationSuite.scala | 45 +++++++++++++++++++ .../execution/datasources/jdbc/JDBCRDD.scala | 35 +++++---------- .../apache/spark/sql/jdbc/JdbcDialects.scala | 27 ++++++++++- .../apache/spark/sql/jdbc/OracleDialect.scala | 15 ++++++- 4 files changed, 95 insertions(+), 27 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index b2f096964427e..e14810a32edc6 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -223,4 +223,49 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types(1).equals("class java.sql.Timestamp")) } + + test("SPARK-18004: Make sure date or timestamp related predicate is pushed down correctly") { + val props = new Properties() + props.put("oracle.jdbc.mapDateToTimestamp", "false") + + val schema = StructType(Seq( + StructField("date_type", DateType, true), + StructField("timestamp_type", TimestampType, true) + )) + + val tableName = "test_date_timestamp_pushdown" + val dateVal = Date.valueOf("2017-06-22") + val timestampVal = Timestamp.valueOf("2017-06-22 21:30:07") + + val data = spark.sparkContext.parallelize(Seq( + Row(dateVal, timestampVal) + )) + + val dfWrite = spark.createDataFrame(data, schema) + dfWrite.write.jdbc(jdbcUrl, tableName, props) + + val dfRead = spark.read.jdbc(jdbcUrl, tableName, props) + + val millis = System.currentTimeMillis() + val dt = new java.sql.Date(millis) + val ts = new java.sql.Timestamp(millis) + + // Query Oracle table with date and timestamp predicates + // which should be pushed down to Oracle. + val df = dfRead.filter(dfRead.col("date_type").lt(dt)) + .filter(dfRead.col("timestamp_type").lt(ts)) + + val metadata = df.queryExecution.sparkPlan.metadata + // The "PushedFilters" part should be exist in Datafrome's + // physical plan and the existence of right literals in + // "PushedFilters" is used to prove that the predicates + // pushing down have been effective. 
+ assert(metadata.get("PushedFilters").ne(None)) + assert(metadata("PushedFilters").contains(dt.toString)) + assert(metadata("PushedFilters").contains(ts.toString)) + + val row = df.collect()(0) + assert(row.getDate(0).equals(dateVal)) + assert(row.getTimestamp(1).equals(timestampVal)) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 2bdc43254133e..0f53b5c7c6f0f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -17,12 +17,10 @@ package org.apache.spark.sql.execution.datasources.jdbc -import java.sql.{Connection, Date, PreparedStatement, ResultSet, SQLException, Timestamp} +import java.sql.{Connection, PreparedStatement, ResultSet, SQLException} import scala.util.control.NonFatal -import org.apache.commons.lang3.StringUtils - import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD @@ -86,20 +84,6 @@ object JDBCRDD extends Logging { new StructType(columns.map(name => fieldMap(name))) } - /** - * Converts value to SQL expression. - */ - private def compileValue(value: Any): Any = value match { - case stringValue: String => s"'${escapeSql(stringValue)}'" - case timestampValue: Timestamp => "'" + timestampValue + "'" - case dateValue: Date => "'" + dateValue + "'" - case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") - case _ => value - } - - private def escapeSql(value: String): String = - if (value == null) null else StringUtils.replace(value, "'", "''") - /** * Turns a single Filter into a String representing a SQL expression. * Returns None for an unhandled filter. 
@@ -108,15 +92,16 @@ object JDBCRDD extends Logging { def quote(colName: String): String = dialect.quoteIdentifier(colName) Option(f match { - case EqualTo(attr, value) => s"${quote(attr)} = ${compileValue(value)}" + case EqualTo(attr, value) => s"${quote(attr)} = ${dialect.compileValue(value)}" case EqualNullSafe(attr, value) => val col = quote(attr) - s"(NOT ($col != ${compileValue(value)} OR $col IS NULL OR " + - s"${compileValue(value)} IS NULL) OR ($col IS NULL AND ${compileValue(value)} IS NULL))" - case LessThan(attr, value) => s"${quote(attr)} < ${compileValue(value)}" - case GreaterThan(attr, value) => s"${quote(attr)} > ${compileValue(value)}" - case LessThanOrEqual(attr, value) => s"${quote(attr)} <= ${compileValue(value)}" - case GreaterThanOrEqual(attr, value) => s"${quote(attr)} >= ${compileValue(value)}" + s"(NOT ($col != ${dialect.compileValue(value)} OR $col IS NULL OR " + + s"${dialect.compileValue(value)} IS NULL) OR " + + s"($col IS NULL AND ${dialect.compileValue(value)} IS NULL))" + case LessThan(attr, value) => s"${quote(attr)} < ${dialect.compileValue(value)}" + case GreaterThan(attr, value) => s"${quote(attr)} > ${dialect.compileValue(value)}" + case LessThanOrEqual(attr, value) => s"${quote(attr)} <= ${dialect.compileValue(value)}" + case GreaterThanOrEqual(attr, value) => s"${quote(attr)} >= ${dialect.compileValue(value)}" case IsNull(attr) => s"${quote(attr)} IS NULL" case IsNotNull(attr) => s"${quote(attr)} IS NOT NULL" case StringStartsWith(attr, value) => s"${quote(attr)} LIKE '${value}%'" @@ -124,7 +109,7 @@ object JDBCRDD extends Logging { case StringContains(attr, value) => s"${quote(attr)} LIKE '%${value}%'" case In(attr, value) if value.isEmpty => s"CASE WHEN ${quote(attr)} IS NULL THEN NULL ELSE FALSE END" - case In(attr, value) => s"${quote(attr)} IN (${compileValue(value)})" + case In(attr, value) => s"${quote(attr)} IN (${dialect.compileValue(value)})" case Not(f) => compileFilter(f, dialect).map(p => s"(NOT ($p))").getOrElse(null) case Or(f1, f2) => // We can't compile Or filter unless both sub-filters are compiled successfully. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index a86a86d408906..7c38ed68c0413 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.jdbc -import java.sql.Connection +import java.sql.{Connection, Date, Timestamp} + +import org.apache.commons.lang3.StringUtils import org.apache.spark.annotation.{DeveloperApi, InterfaceStability, Since} import org.apache.spark.sql.types._ @@ -123,6 +125,29 @@ abstract class JdbcDialect extends Serializable { def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = { } + /** + * Escape special characters in SQL string literals. + * @param value The string to be escaped. + * @return Escaped string. + */ + @Since("2.3.0") + protected[jdbc] def escapeSql(value: String): String = + if (value == null) null else StringUtils.replace(value, "'", "''") + + /** + * Converts value to SQL expression. + * @param value The value to be converted. + * @return Converted value. 
+ */ + @Since("2.3.0") + def compileValue(value: Any): Any = value match { + case stringValue: String => s"'${escapeSql(stringValue)}'" + case timestampValue: Timestamp => "'" + timestampValue + "'" + case dateValue: Date => "'" + dateValue + "'" + case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") + case _ => value + } + /** * Return Some[true] iff `TRUNCATE TABLE` causes cascading default. * Some[true] : TRUNCATE TABLE causes cascading. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index 20e634c06b610..3b44c1de93a61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.jdbc -import java.sql.Types +import java.sql.{Date, Timestamp, Types} import org.apache.spark.sql.types._ @@ -64,5 +64,18 @@ private case object OracleDialect extends JdbcDialect { case _ => None } + override def compileValue(value: Any): Any = value match { + // The JDBC drivers support date literals in SQL statements written in the + // format: {d 'yyyy-mm-dd'} and timestamp literals in SQL statements written + // in the format: {ts 'yyyy-mm-dd hh:mm:ss.f...'}. For details, see + // 'Oracle Database JDBC Developer’s Guide and Reference, 11g Release 1 (11.1)' + // Appendix A Reference Information. + case stringValue: String => s"'${escapeSql(stringValue)}'" + case timestampValue: Timestamp => "{ts '" + timestampValue + "'}" + case dateValue: Date => "{d '" + dateValue + "'}" + case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") + case _ => value + } + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } From d913db16a0de0983961f9d0c5f9b146be7226ac1 Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Mon, 3 Jul 2017 13:31:01 +0800 Subject: [PATCH 069/779] [SPARK-21250][WEB-UI] Add a url in the table of 'Running Executors' in worker page to visit job page. ## What changes were proposed in this pull request? Add a url in the table of 'Running Executors' in worker page to visit job page. When I click URL of 'Name', the current page jumps to the job page. Of course this is only in the table of 'Running Executors'. This URL of 'Name' is in the table of 'Finished Executors' does not exist, the click will not jump to any page. fix before: ![1](https://user-images.githubusercontent.com/26266482/27679397-30ddc262-5ceb-11e7-839b-0889d1f42480.png) fix after: ![2](https://user-images.githubusercontent.com/26266482/27679405-3588ef12-5ceb-11e7-9756-0a93815cd698.png) ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Closes #18464 from guoxiaolongzte/SPARK-21250. 
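As a reading aid for the change described above, here is a minimal, hedged Scala sketch of the rule it introduces: the application name in the 'Running Executors' table is rendered as a link to the application UI only while the executor is RUNNING and a UI URL is available; otherwise it stays plain text (which is why rows in 'Finished Executors' keep a plain name). The `ExecState`, `AppDesc`, and `nameCell` names below are simplified stand-ins invented for illustration; they are not Spark's actual `WorkerPage` code, which expresses the same branch with Scala XML literals (see the diff below).

```scala
// Illustrative sketch only: simplified stand-ins, not Spark's WorkerPage implementation.
object NameCellSketch {
  sealed trait ExecState
  case object Running extends ExecState
  case object Finished extends ExecState

  // Hypothetical, trimmed-down view of an executor's application description.
  final case class AppDesc(name: String, appUiUrl: String)

  // Render the "Name" cell: a link while the executor is running and a UI URL exists,
  // plain text otherwise (e.g. for entries in the 'Finished Executors' table).
  def nameCell(state: ExecState, app: AppDesc): String =
    if (state == Running && app.appUiUrl.nonEmpty) {
      s"""<a href="${app.appUiUrl}">${app.name}</a>"""
    } else {
      app.name
    }

  def main(args: Array[String]): Unit = {
    println(nameCell(Running, AppDesc("my-app", "http://worker-host:4040")))  // linked
    println(nameCell(Finished, AppDesc("my-app", "http://worker-host:4040"))) // plain text
  }
}
```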
--- .../apache/spark/deploy/worker/ui/WorkerPage.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala index 1ad973122b609..ea39b0dce0a41 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala @@ -23,8 +23,8 @@ import scala.xml.Node import org.json4s.JValue +import org.apache.spark.deploy.{ExecutorState, JsonProtocol} import org.apache.spark.deploy.DeployMessages.{RequestWorkerState, WorkerStateResponse} -import org.apache.spark.deploy.JsonProtocol import org.apache.spark.deploy.master.DriverState import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner} import org.apache.spark.ui.{UIUtils, WebUIPage} @@ -112,7 +112,15 @@ private[ui] class WorkerPage(parent: WorkerWebUI) extends WebUIPage("") {
           <li><strong>ID:</strong> {executor.appId}</li>
-          <li><strong>Name:</strong> {executor.appDesc.name}</li>
+          <li><strong>Name:</strong>
+            {
+              if ({executor.state == ExecutorState.RUNNING} && executor.appDesc.appUiUrl.nonEmpty) {
+                <a href={executor.appDesc.appUiUrl}> {executor.appDesc.name}</a>
+              } else {
+                {executor.appDesc.name}
+              }
+            }
+          </li>
           <li><strong>User:</strong> {executor.appDesc.user}</li>
From a9339db99f0620d4828eb903523be55dfbf2fb64 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 3 Jul 2017 19:52:39 +0800 Subject: [PATCH 070/779] [SPARK-21137][CORE] Spark reads many small files slowly ## What changes were proposed in this pull request? Parallelize FileInputFormat.listStatus in Hadoop API via LIST_STATUS_NUM_THREADS to speed up examination of file sizes for wholeTextFiles et al ## How was this patch tested? Existing tests, which will exercise the key path here: using a local file system. Author: Sean Owen Closes #18441 from srowen/SPARK-21137. --- .../main/scala/org/apache/spark/rdd/BinaryFileRDD.scala | 7 ++++++- .../main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala index 50d977a92da51..a14bad47dfe10 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala @@ -20,6 +20,7 @@ package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} @@ -35,8 +36,12 @@ private[spark] class BinaryFileRDD[T]( extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { - val inputFormat = inputFormatClass.newInstance val conf = getConf + // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when + // traversing a large number of directories and files. Parallelize it. + conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS, + Runtime.getRuntime.availableProcessors().toString) + val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) diff --git a/core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala index 8e1baae796fc5..9f3d0745c33c9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala @@ -20,6 +20,7 @@ package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.{Text, Writable} import org.apache.hadoop.mapreduce.InputSplit +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} @@ -38,8 +39,12 @@ private[spark] class WholeTextFileRDD( extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { - val inputFormat = inputFormatClass.newInstance val conf = getConf + // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when + // traversing a large number of directories and files. Parallelize it. 
+ conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS, + Runtime.getRuntime.availableProcessors().toString) + val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) From eb7a5a66bbd5837c01f13c76b68de2a6034976f3 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Mon, 3 Jul 2017 09:01:42 -0700 Subject: [PATCH 071/779] [TEST] Load test table based on case sensitivity ## What changes were proposed in this pull request? It is strange that we will get "table not found" error if **the first sql** uses upper case table names, when developers write tests with `TestHiveSingleton`, **although case insensitivity**. This is because in `TestHiveQueryExecution`, test tables are loaded based on exact matching instead of case sensitivity. ## How was this patch tested? Added a new test case. Author: Zhenhua Wang Closes #18504 from wzhfy/testHive. --- .../apache/spark/sql/hive/test/TestHive.scala | 7 ++- .../apache/spark/sql/hive/TestHiveSuite.scala | 45 +++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/TestHiveSuite.scala diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 4e1792321c89b..801f9b9923641 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -449,6 +449,8 @@ private[hive] class TestHiveSparkSession( private val loadedTables = new collection.mutable.HashSet[String] + def getLoadedTables: collection.mutable.HashSet[String] = loadedTables + def loadTestTable(name: String) { if (!(loadedTables contains name)) { // Marks the table as loaded first to prevent infinite mutually recursive table loading. @@ -553,7 +555,10 @@ private[hive] class TestHiveQueryExecution( val referencedTables = describedTables ++ logical.collect { case UnresolvedRelation(tableIdent, _) => tableIdent.table } - val referencedTestTables = referencedTables.filter(sparkSession.testTables.contains) + val resolver = sparkSession.sessionState.conf.resolver + val referencedTestTables = sparkSession.testTables.keys.filter { testTable => + referencedTables.exists(resolver(_, testTable)) + } logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") referencedTestTables.foreach(sparkSession.loadTestTable) // Proceed with analysis. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/TestHiveSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/TestHiveSuite.scala new file mode 100644 index 0000000000000..193fa83dbad99 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/TestHiveSuite.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.hive.test.{TestHiveSingleton, TestHiveSparkSession} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils + + +class TestHiveSuite extends TestHiveSingleton with SQLTestUtils { + test("load test table based on case sensitivity") { + val testHiveSparkSession = spark.asInstanceOf[TestHiveSparkSession] + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql("SELECT * FROM SRC").queryExecution.analyzed + assert(testHiveSparkSession.getLoadedTables.contains("src")) + assert(testHiveSparkSession.getLoadedTables.size == 1) + } + testHiveSparkSession.reset() + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val err = intercept[AnalysisException] { + sql("SELECT * FROM SRC").queryExecution.analyzed + } + assert(err.message.contains("Table or view not found")) + } + testHiveSparkSession.reset() + } +} From 17bdc36ef16a544b693c628db276fe32db87fe7a Mon Sep 17 00:00:00 2001 From: aokolnychyi Date: Mon, 3 Jul 2017 09:35:49 -0700 Subject: [PATCH 072/779] [SPARK-21102][SQL] Refresh command is too aggressive in parsing ### Idea This PR adds validation to REFRESH sql statements. Currently, users can specify whatever they want as resource path. For example, spark.sql("REFRESH ! $ !") will be executed without any exceptions. ### Implementation I am not sure that my current implementation is the most optimal, so any feedback is appreciated. My first idea was to make the grammar as strict as possible. Unfortunately, there were some problems. I tried the approach below: SqlBase.g4 ``` ... | REFRESH TABLE tableIdentifier #refreshTable | REFRESH resourcePath #refreshResource ... resourcePath : STRING | (IDENTIFIER | number | nonReserved | '/' | '-')+ // other symbols can be added if needed ; ``` It is not flexible enough and requires to explicitly mention all possible symbols. Therefore, I came up with the current approach that is implemented in the code. Let me know your opinion on which one is better. Author: aokolnychyi Closes #18368 from aokolnychyi/spark-21102. --- .../spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../spark/sql/execution/SparkSqlParser.scala | 20 +++++++++++++++--- .../sql/execution/SparkSqlParserSuite.scala | 21 ++++++++++++++++++- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 7ffa150096333..29f554451ed4a 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -149,7 +149,7 @@ statement | (DESC | DESCRIBE) TABLE? option=(EXTENDED | FORMATTED)? tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable - | REFRESH .*? #refreshResource + | REFRESH (STRING | .*?) #refreshResource | CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable | UNCACHE TABLE (IF EXISTS)? 
tableIdentifier #uncacheTable | CLEAR CACHE #clearCache diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 3c58c6e1b6780..2b79eb5eac0f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -230,11 +230,25 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } /** - * Create a [[RefreshTable]] logical plan. + * Create a [[RefreshResource]] logical plan. */ override def visitRefreshResource(ctx: RefreshResourceContext): LogicalPlan = withOrigin(ctx) { - val resourcePath = remainder(ctx.REFRESH.getSymbol).trim - RefreshResource(resourcePath) + val path = if (ctx.STRING != null) string(ctx.STRING) else extractUnquotedResourcePath(ctx) + RefreshResource(path) + } + + private def extractUnquotedResourcePath(ctx: RefreshResourceContext): String = withOrigin(ctx) { + val unquotedPath = remainder(ctx.REFRESH.getSymbol).trim + validate( + unquotedPath != null && !unquotedPath.isEmpty, + "Resource paths cannot be empty in REFRESH statements. Use / to match everything", + ctx) + val forbiddenSymbols = Seq(" ", "\n", "\r", "\t") + validate( + !forbiddenSymbols.exists(unquotedPath.contains(_)), + "REFRESH statements cannot contain ' ', '\\n', '\\r', '\\t' inside unquoted resource paths", + ctx) + unquotedPath } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index bd9c2ebd6fab9..d238c76fbeeff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.{Ascending, Concat, SortOrder} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, RepartitionByExpression, Sort} import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.datasources.CreateTable +import org.apache.spark.sql.execution.datasources.{CreateTable, RefreshResource} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} @@ -66,6 +66,25 @@ class SparkSqlParserSuite extends AnalysisTest { } } + test("refresh resource") { + assertEqual("REFRESH prefix_path", RefreshResource("prefix_path")) + assertEqual("REFRESH /", RefreshResource("/")) + assertEqual("REFRESH /path///a", RefreshResource("/path///a")) + assertEqual("REFRESH pat1h/112/_1a", RefreshResource("pat1h/112/_1a")) + assertEqual("REFRESH pat1h/112/_1a/a-1", RefreshResource("pat1h/112/_1a/a-1")) + assertEqual("REFRESH path-with-dash", RefreshResource("path-with-dash")) + assertEqual("REFRESH \'path with space\'", RefreshResource("path with space")) + assertEqual("REFRESH \"path with space 2\"", RefreshResource("path with space 2")) + intercept("REFRESH a b", "REFRESH statements cannot contain") + intercept("REFRESH a\tb", "REFRESH statements cannot contain") + intercept("REFRESH a\nb", "REFRESH statements cannot contain") + intercept("REFRESH a\rb", "REFRESH statements cannot contain") + intercept("REFRESH a\r\nb", "REFRESH statements cannot contain") + intercept("REFRESH @ $a$", "REFRESH statements cannot contain") + 
intercept("REFRESH ", "Resource paths cannot be empty in REFRESH statements") + intercept("REFRESH", "Resource paths cannot be empty in REFRESH statements") + } + test("show functions") { assertEqual("show functions", ShowFunctionsCommand(None, None, true, true)) assertEqual("show all functions", ShowFunctionsCommand(None, None, true, true)) From 363bfe30ba44852a8fac946a37032f76480f6f1b Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 3 Jul 2017 10:14:03 -0700 Subject: [PATCH 073/779] [SPARK-20073][SQL] Prints an explicit warning message in case of NULL-safe equals ## What changes were proposed in this pull request? This pr added code to print the same warning messages with `===` cases when using NULL-safe equals (`<=>`). ## How was this patch tested? Existing tests. Author: Takeshi Yamamuro Closes #18436 from maropu/SPARK-20073. --- .../src/main/scala/org/apache/spark/sql/Column.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 7e1f1d83cb3de..bd1669b6dba69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -464,7 +464,15 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ - def <=> (other: Any): Column = withExpr { EqualNullSafe(expr, lit(other).expr) } + def <=> (other: Any): Column = withExpr { + val right = lit(other).expr + if (this.expr == right) { + logWarning( + s"Constructing trivially true equals predicate, '${this.expr} <=> $right'. " + + "Perhaps you need to use aliases.") + } + EqualNullSafe(expr, right) + } /** * Equality test that is safe for null values. From f953ca56eccdaef29ac580d44613a028415ba3f5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 3 Jul 2017 10:51:44 -0700 Subject: [PATCH 074/779] [SPARK-21284][SQL] rename SessionCatalog.registerFunction parameter name ## What changes were proposed in this pull request? Looking at the code in `SessionCatalog.registerFunction`, the parameter `ignoreIfExists` is a wrong name. When `ignoreIfExists` is true, we will override the function if it already exists. So `overrideIfExists` should be the corrected name. ## How was this patch tested? N/A Author: Wenchen Fan Closes #18510 from cloud-fan/minor. 
--- .../sql/catalyst/catalog/SessionCatalog.scala | 6 +++--- .../catalog/SessionCatalogSuite.scala | 20 ++++++++++--------- .../sql/execution/command/functions.scala | 3 +-- .../spark/sql/internal/CatalogSuite.scala | 2 +- .../spark/sql/hive/HiveSessionCatalog.scala | 2 +- .../ObjectHashAggregateExecBenchmark.scala | 2 +- 6 files changed, 18 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 7ece77df7fc14..a86604e4353ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -1104,10 +1104,10 @@ class SessionCatalog( */ def registerFunction( funcDefinition: CatalogFunction, - ignoreIfExists: Boolean, + overrideIfExists: Boolean, functionBuilder: Option[FunctionBuilder] = None): Unit = { val func = funcDefinition.identifier - if (functionRegistry.functionExists(func) && !ignoreIfExists) { + if (functionRegistry.functionExists(func) && !overrideIfExists) { throw new AnalysisException(s"Function $func already exists") } val info = new ExpressionInfo(funcDefinition.className, func.database.orNull, func.funcName) @@ -1219,7 +1219,7 @@ class SessionCatalog( // catalog. So, it is possible that qualifiedName is not exactly the same as // catalogFunction.identifier.unquotedString (difference is on case-sensitivity). // At here, we preserve the input from the user. - registerFunction(catalogFunction.copy(identifier = qualifiedName), ignoreIfExists = false) + registerFunction(catalogFunction.copy(identifier = qualifiedName), overrideIfExists = false) // Now, we need to create the Expression. 
functionRegistry.lookupFunction(qualifiedName, children) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index fc3893e197792..8f856a0daad15 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -1175,9 +1175,9 @@ abstract class SessionCatalogSuite extends AnalysisTest { val tempFunc1 = (e: Seq[Expression]) => e.head val tempFunc2 = (e: Seq[Expression]) => e.last catalog.registerFunction( - newFunc("temp1", None), ignoreIfExists = false, functionBuilder = Some(tempFunc1)) + newFunc("temp1", None), overrideIfExists = false, functionBuilder = Some(tempFunc1)) catalog.registerFunction( - newFunc("temp2", None), ignoreIfExists = false, functionBuilder = Some(tempFunc2)) + newFunc("temp2", None), overrideIfExists = false, functionBuilder = Some(tempFunc2)) val arguments = Seq(Literal(1), Literal(2), Literal(3)) assert(catalog.lookupFunction(FunctionIdentifier("temp1"), arguments) === Literal(1)) assert(catalog.lookupFunction(FunctionIdentifier("temp2"), arguments) === Literal(3)) @@ -1189,12 +1189,12 @@ abstract class SessionCatalogSuite extends AnalysisTest { // Temporary function already exists val e = intercept[AnalysisException] { catalog.registerFunction( - newFunc("temp1", None), ignoreIfExists = false, functionBuilder = Some(tempFunc3)) + newFunc("temp1", None), overrideIfExists = false, functionBuilder = Some(tempFunc3)) }.getMessage assert(e.contains("Function temp1 already exists")) // Temporary function is overridden catalog.registerFunction( - newFunc("temp1", None), ignoreIfExists = true, functionBuilder = Some(tempFunc3)) + newFunc("temp1", None), overrideIfExists = true, functionBuilder = Some(tempFunc3)) assert( catalog.lookupFunction( FunctionIdentifier("temp1"), arguments) === Literal(arguments.length)) @@ -1208,7 +1208,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { val tempFunc1 = (e: Seq[Expression]) => e.head catalog.registerFunction( - newFunc("temp1", None), ignoreIfExists = false, functionBuilder = Some(tempFunc1)) + newFunc("temp1", None), overrideIfExists = false, functionBuilder = Some(tempFunc1)) // Returns true when the function is temporary assert(catalog.isTemporaryFunction(FunctionIdentifier("temp1"))) @@ -1259,7 +1259,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { withBasicCatalog { catalog => val tempFunc = (e: Seq[Expression]) => e.head catalog.registerFunction( - newFunc("func1", None), ignoreIfExists = false, functionBuilder = Some(tempFunc)) + newFunc("func1", None), overrideIfExists = false, functionBuilder = Some(tempFunc)) val arguments = Seq(Literal(1), Literal(2), Literal(3)) assert(catalog.lookupFunction(FunctionIdentifier("func1"), arguments) === Literal(1)) catalog.dropTempFunction("func1", ignoreIfNotExists = false) @@ -1300,7 +1300,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { withBasicCatalog { catalog => val tempFunc1 = (e: Seq[Expression]) => e.head catalog.registerFunction( - newFunc("func1", None), ignoreIfExists = false, functionBuilder = Some(tempFunc1)) + newFunc("func1", None), overrideIfExists = false, functionBuilder = Some(tempFunc1)) assert(catalog.lookupFunction( FunctionIdentifier("func1"), Seq(Literal(1), Literal(2), Literal(3))) == Literal(1)) catalog.dropTempFunction("func1", 
ignoreIfNotExists = false) @@ -1318,8 +1318,10 @@ abstract class SessionCatalogSuite extends AnalysisTest { val tempFunc2 = (e: Seq[Expression]) => e.last catalog.createFunction(newFunc("func2", Some("db2")), ignoreIfExists = false) catalog.createFunction(newFunc("not_me", Some("db2")), ignoreIfExists = false) - catalog.registerFunction(funcMeta1, ignoreIfExists = false, functionBuilder = Some(tempFunc1)) - catalog.registerFunction(funcMeta2, ignoreIfExists = false, functionBuilder = Some(tempFunc2)) + catalog.registerFunction( + funcMeta1, overrideIfExists = false, functionBuilder = Some(tempFunc1)) + catalog.registerFunction( + funcMeta2, overrideIfExists = false, functionBuilder = Some(tempFunc2)) assert(catalog.listFunctions("db1", "*").map(_._1).toSet == Set(FunctionIdentifier("func1"), FunctionIdentifier("yes_me"))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index f39a3269efaf1..a91ad413f4d1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -58,9 +58,8 @@ case class CreateFunctionCommand( s"is not allowed: '${databaseName.get}'") } // We first load resources and then put the builder in the function registry. - // Please note that it is allowed to overwrite an existing temp function. catalog.loadFunctionResources(resources) - catalog.registerFunction(func, ignoreIfExists = false) + catalog.registerFunction(func, overrideIfExists = false) } else { // For a permanent, we will store the metadata into underlying external catalog. // This function will be loaded into the FunctionRegistry when a query uses it. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index b2d568ce320e6..6acac1a9aa317 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -79,7 +79,7 @@ class CatalogSuite val tempFunc = (e: Seq[Expression]) => e.head val funcMeta = CatalogFunction(FunctionIdentifier(name, None), "className", Nil) sessionCatalog.registerFunction( - funcMeta, ignoreIfExists = false, functionBuilder = Some(tempFunc)) + funcMeta, overrideIfExists = false, functionBuilder = Some(tempFunc)) } private def dropFunction(name: String, db: Option[String] = None): Unit = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index da87f0218e3ad..0d0269f694300 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -161,7 +161,7 @@ private[sql] class HiveSessionCatalog( FunctionIdentifier(functionName.toLowerCase(Locale.ROOT), database) val func = CatalogFunction(functionIdentifier, className, Nil) // Put this Hive built-in function to our function registry. - registerFunction(func, ignoreIfExists = false) + registerFunction(func, overrideIfExists = false) // Now, we need to create the Expression. 
functionRegistry.lookupFunction(functionIdentifier, children) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala index 73383ae4d4118..e599d1ab1d486 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala @@ -221,7 +221,7 @@ class ObjectHashAggregateExecBenchmark extends BenchmarkBase with TestHiveSingle val sessionCatalog = sparkSession.sessionState.catalog.asInstanceOf[HiveSessionCatalog] val functionIdentifier = FunctionIdentifier(functionName, database = None) val func = CatalogFunction(functionIdentifier, clazz.getName, resources = Nil) - sessionCatalog.registerFunction(func, ignoreIfExists = false) + sessionCatalog.registerFunction(func, overrideIfExists = false) } private def percentile_approx( From c79c10ebaf3d63b697b8d6d1a7e55aa2d406af69 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 3 Jul 2017 16:18:54 -0700 Subject: [PATCH 075/779] [TEST] Different behaviors of SparkContext Conf when building SparkSession ## What changes were proposed in this pull request? If the created ACTIVE sparkContext is not EXPLICITLY passed through the Builder's API `sparkContext()`, the conf of this sparkContext will also contain the conf set through the API `config()`; otherwise, the conf of this sparkContext will NOT contain the conf set through the API `config()` ## How was this patch tested? N/A Author: gatorsmile Closes #18517 from gatorsmile/fixTestCase2. --- .../spark/sql/SparkSessionBuilderSuite.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index 386d13d07a95f..4f6d5f79d466e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -98,12 +98,31 @@ class SparkSessionBuilderSuite extends SparkFunSuite { val session = SparkSession.builder().config("key2", "value2").getOrCreate() assert(session.conf.get("key1") == "value1") assert(session.conf.get("key2") == "value2") + assert(session.sparkContext == sparkContext2) assert(session.sparkContext.conf.get("key1") == "value1") + // If the created sparkContext is not passed through the Builder's API sparkContext, + // the conf of this sparkContext will also contain the conf set through the API config. assert(session.sparkContext.conf.get("key2") == "value2") assert(session.sparkContext.conf.get("spark.app.name") == "test") session.stop() } + test("create SparkContext first then pass context to SparkSession") { + sparkContext.stop() + val conf = new SparkConf().setAppName("test").setMaster("local").set("key1", "value1") + val newSC = new SparkContext(conf) + val session = SparkSession.builder().sparkContext(newSC).config("key2", "value2").getOrCreate() + assert(session.conf.get("key1") == "value1") + assert(session.conf.get("key2") == "value2") + assert(session.sparkContext == newSC) + assert(session.sparkContext.conf.get("key1") == "value1") + // If the created sparkContext is passed through the Builder's API sparkContext, + // the conf of this sparkContext will not contain the conf set through the API config. 
+ assert(!session.sparkContext.conf.contains("key2")) + assert(session.sparkContext.conf.get("spark.app.name") == "test") + session.stop() + } + test("SPARK-15887: hive-site.xml should be loaded") { val session = SparkSession.builder().master("local").getOrCreate() assert(session.sessionState.newHadoopConf().get("hive.in.test") == "true") From 6657e00de36b59011d3fe78e8613fb64e54c957a Mon Sep 17 00:00:00 2001 From: liuxian Date: Tue, 4 Jul 2017 09:16:40 +0800 Subject: [PATCH 076/779] [SPARK-21283][CORE] FileOutputStream should be created as append mode ## What changes were proposed in this pull request? `FileAppender` is used to write `stderr` and `stdout` files in `ExecutorRunner`, But before writing `ErrorStream` into the the `stderr` file, the header information has been written into ,if FileOutputStream is not created as append mode, the header information will be lost ## How was this patch tested? unit test case Author: liuxian Closes #18507 from 10110346/wip-lx-0703. --- .../scala/org/apache/spark/util/logging/FileAppender.scala | 2 +- .../test/scala/org/apache/spark/util/FileAppenderSuite.scala | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala index fdb1495899bc3..8a0cc709bccc5 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala @@ -94,7 +94,7 @@ private[spark] class FileAppender(inputStream: InputStream, file: File, bufferSi /** Open the file output stream */ protected def openFile() { - outputStream = new FileOutputStream(file, false) + outputStream = new FileOutputStream(file, true) logDebug(s"Opened file $file") } diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index 7e2da8e141532..cd0ed5b036bf9 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -52,10 +52,13 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { test("basic file appender") { val testString = (1 to 1000).mkString(", ") val inputStream = new ByteArrayInputStream(testString.getBytes(StandardCharsets.UTF_8)) + // The `header` should not be covered + val header = "Add header" + Files.write(header, testFile, StandardCharsets.UTF_8) val appender = new FileAppender(inputStream, testFile) inputStream.close() appender.awaitTermination() - assert(Files.toString(testFile, StandardCharsets.UTF_8) === testString) + assert(Files.toString(testFile, StandardCharsets.UTF_8) === header + testString) } test("rolling file appender - time-based rolling") { From a848d552ef6b5d0d3bb3b2da903478437a8b10aa Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 4 Jul 2017 11:35:08 +0900 Subject: [PATCH 077/779] [SPARK-21264][PYTHON] Call cross join path in join without 'on' and with 'how' ## What changes were proposed in this pull request? Currently, it throws a NPE when missing columns but join type is speicified in join at PySpark as below: ```python spark.conf.set("spark.sql.crossJoin.enabled", "false") spark.range(1).join(spark.range(1), how="inner").show() ``` ``` Traceback (most recent call last): ... py4j.protocol.Py4JJavaError: An error occurred while calling o66.join. 
: java.lang.NullPointerException at org.apache.spark.sql.Dataset.join(Dataset.scala:931) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ... ``` ```python spark.conf.set("spark.sql.crossJoin.enabled", "true") spark.range(1).join(spark.range(1), how="inner").show() ``` ``` ... py4j.protocol.Py4JJavaError: An error occurred while calling o84.join. : java.lang.NullPointerException at org.apache.spark.sql.Dataset.join(Dataset.scala:931) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ... ``` This PR suggests to follow Scala's one as below: ```scala scala> spark.conf.set("spark.sql.crossJoin.enabled", "false") scala> spark.range(1).join(spark.range(1), Seq.empty[String], "inner").show() ``` ``` org.apache.spark.sql.AnalysisException: Detected cartesian product for INNER join between logical plans Range (0, 1, step=1, splits=Some(8)) and Range (0, 1, step=1, splits=Some(8)) Join condition is missing or trivial. Use the CROSS JOIN syntax to allow cartesian products between these relations.; ... ``` ```scala scala> spark.conf.set("spark.sql.crossJoin.enabled", "true") scala> spark.range(1).join(spark.range(1), Seq.empty[String], "inner").show() ``` ``` +---+---+ | id| id| +---+---+ | 0| 0| +---+---+ ``` **After** ```python spark.conf.set("spark.sql.crossJoin.enabled", "false") spark.range(1).join(spark.range(1), how="inner").show() ``` ``` Traceback (most recent call last): ... pyspark.sql.utils.AnalysisException: u'Detected cartesian product for INNER join between logical plans\nRange (0, 1, step=1, splits=Some(8))\nand\nRange (0, 1, step=1, splits=Some(8))\nJoin condition is missing or trivial.\nUse the CROSS JOIN syntax to allow cartesian products between these relations.;' ``` ```python spark.conf.set("spark.sql.crossJoin.enabled", "true") spark.range(1).join(spark.range(1), how="inner").show() ``` ``` +---+---+ | id| id| +---+---+ | 0| 0| +---+---+ ``` ## How was this patch tested? Added tests in `python/pyspark/sql/tests.py`. Author: hyukjinkwon Closes #18484 from HyukjinKwon/SPARK-21264. --- python/pyspark/sql/dataframe.py | 2 ++ python/pyspark/sql/tests.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 0649271ed2246..27a6dad8917d3 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -833,6 +833,8 @@ def join(self, other, on=None, how=None): else: if how is None: how = "inner" + if on is None: + on = self._jseq([]) assert isinstance(how, basestring), "how should be basestring" jdf = self._jdf.join(other._jdf, on, how) return DataFrame(jdf, self.sql_ctx) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 0a1cd6856b8e8..c105969b26b97 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2021,6 +2021,22 @@ def test_toDF_with_schema_string(self): self.assertEqual(df.schema.simpleString(), "struct") self.assertEqual(df.collect(), [Row(key=i) for i in range(100)]) + def test_join_without_on(self): + df1 = self.spark.range(1).toDF("a") + df2 = self.spark.range(1).toDF("b") + + try: + self.spark.conf.set("spark.sql.crossJoin.enabled", "false") + self.assertRaises(AnalysisException, lambda: df1.join(df2, how="inner").collect()) + + self.spark.conf.set("spark.sql.crossJoin.enabled", "true") + actual = df1.join(df2, how="inner").collect() + expected = [Row(a=0, b=0)] + self.assertEqual(actual, expected) + finally: + # We should unset this. Otherwise, other tests are affected. 
+ self.spark.conf.unset("spark.sql.crossJoin.enabled") + # Regression test for invalid join methods when on is None, Spark-14761 def test_invalid_join_method(self): df1 = self.spark.createDataFrame([("Alice", 5), ("Bob", 8)], ["name", "age"]) From 8ca4ebefa6301d9cb633ea15cf71f49c2d7f8607 Mon Sep 17 00:00:00 2001 From: Thomas Decaux Date: Tue, 4 Jul 2017 12:17:48 +0100 Subject: [PATCH 078/779] [MINOR] Add french stop word "les" ## What changes were proposed in this pull request? Added "les" as french stop word (plurial of le) Author: Thomas Decaux Closes #18514 from ebuildy/patch-1. --- .../resources/org/apache/spark/ml/feature/stopwords/french.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt index 94b8f8f39a3e1..a59a0424616cc 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt @@ -15,6 +15,7 @@ il je la le +les leur lui ma @@ -152,4 +153,4 @@ eusses eût eussions eussiez -eussent \ No newline at end of file +eussent From 2b1e94b9add82b30bc94f639fa97492624bf0dce Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 4 Jul 2017 12:18:42 +0100 Subject: [PATCH 079/779] [MINOR][SPARK SUBMIT] Print out R file usage in spark-submit ## What changes were proposed in this pull request? Currently, running the shell below: ```bash $ ./bin/spark-submit tmp.R a b c ``` with R file, `tmp.R` as below: ```r #!/usr/bin/env Rscript library(SparkR) sparkRSQL.init(sparkR.init(master = "local")) collect(createDataFrame(list(list(1)))) print(commandArgs(trailingOnly = TRUE)) ``` working fine as below: ```bash _1 1 1 [1] "a" "b" "c" ``` However, it looks not printed in usage documentation as below: ```bash $ ./bin/spark-submit ``` ``` Usage: spark-submit [options] [app arguments] ... ``` For `./bin/sparkR`, it looks fine as below: ```bash $ ./bin/sparkR tmp.R ``` ``` Running R applications through 'sparkR' is not supported as of Spark 2.0. Use ./bin/spark-submit ``` Running the script below: ```bash $ ./bin/spark-submit ``` **Before** ``` Usage: spark-submit [options] [app arguments] ... ``` **After** ``` Usage: spark-submit [options] [app arguments] ... ``` ## How was this patch tested? Manually tested. Author: hyukjinkwon Closes #18505 from HyukjinKwon/minor-doc-summit. --- .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 3d9a14c51618b..7800d3d624e3e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -504,7 +504,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S outStream.println("Unknown/unsupported param " + unknownParam) } val command = sys.env.get("_SPARK_CMD_USAGE").getOrElse( - """Usage: spark-submit [options] [app arguments] + """Usage: spark-submit [options] [app arguments] |Usage: spark-submit --kill [submission ID] --master [spark://...] |Usage: spark-submit --status [submission ID] --master [spark://...] 
|Usage: spark-submit run-example [options] example-class [example args]""".stripMargin) From d492cc5a21cd67b3999b85d97f5c41c3734b1ba3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 4 Jul 2017 20:45:58 +0800 Subject: [PATCH 080/779] [SPARK-19507][SPARK-21296][PYTHON] Avoid per-record type dispatch in schema verification and improve exception message ## What changes were proposed in this pull request? **Context** While reviewing https://github.com/apache/spark/pull/17227, I realised here we type-dispatch per record. The PR itself is fine in terms of performance as is but this prints a prefix, `"obj"` in exception message as below: ``` from pyspark.sql.types import * schema = StructType([StructField('s', IntegerType(), nullable=False)]) spark.createDataFrame([["1"]], schema) ... TypeError: obj.s: IntegerType can not accept object '1' in type ``` I suggested to get rid of this but during investigating this, I realised my approach might bring a performance regression as it is a hot path. Only for SPARK-19507 and https://github.com/apache/spark/pull/17227, It needs more changes to cleanly get rid of the prefix and I rather decided to fix both issues together. **Propersal** This PR tried to - get rid of per-record type dispatch as we do in many code paths in Scala so that it improves the performance (roughly ~25% improvement) - SPARK-21296 This was tested with a simple code `spark.createDataFrame(range(1000000), "int")`. However, I am quite sure the actual improvement in practice is larger than this, in particular, when the schema is complicated. - improve error message in exception describing field information as prose - SPARK-19507 ## How was this patch tested? Manually tested and unit tests were added in `python/pyspark/sql/tests.py`. Benchmark - codes: https://gist.github.com/HyukjinKwon/c3397469c56cb26c2d7dd521ed0bc5a3 Error message - codes: https://gist.github.com/HyukjinKwon/b1b2c7f65865444c4a8836435100e398 **Before** Benchmark: - Results: https://gist.github.com/HyukjinKwon/4a291dab45542106301a0c1abcdca924 Error message - Results: https://gist.github.com/HyukjinKwon/57b1916395794ce924faa32b14a3fe19 **After** Benchmark - Results: https://gist.github.com/HyukjinKwon/21496feecc4a920e50c4e455f836266e Error message - Results: https://gist.github.com/HyukjinKwon/7a494e4557fe32a652ce1236e504a395 Closes #17227 Author: hyukjinkwon Author: David Gingrich Closes #18521 from HyukjinKwon/python-type-dispatch. --- python/pyspark/rdd.py | 1 - python/pyspark/sql/session.py | 12 +- python/pyspark/sql/tests.py | 203 ++++++++++++++++++++++++++++++- python/pyspark/sql/types.py | 219 +++++++++++++++++++++++----------- 4 files changed, 352 insertions(+), 83 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 60141792d499b..7dfa17f68a943 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -627,7 +627,6 @@ def sortPartition(iterator): def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x): """ Sorts this RDD, which is assumed to consist of (key, value) pairs. 
- # noqa >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] >>> sc.parallelize(tmp).sortByKey().first() diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index e3bf0f35ea15e..2cc0e2d1d7b8d 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -33,7 +33,7 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import Row, DataType, StringType, StructType, _verify_type, \ +from pyspark.sql.types import Row, DataType, StringType, StructType, _make_type_verifier, \ _infer_schema, _has_nulltype, _merge_type, _create_converter, _parse_datatype_string from pyspark.sql.utils import install_exception_handler @@ -514,17 +514,21 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr schema = [str(x) for x in data.columns] data = [r.tolist() for r in data.to_records(index=False)] - verify_func = _verify_type if verifySchema else lambda _, t: True if isinstance(schema, StructType): + verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True + def prepare(obj): - verify_func(obj, schema) + verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) + verify_func = _make_type_verifier( + dataType, name="field value") if verifySchema else lambda _: True + def prepare(obj): - verify_func(obj, dataType) + verify_func(obj) return obj, else: if isinstance(schema, list): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c105969b26b97..16ba8bd73f400 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -57,7 +57,7 @@ from pyspark import SparkContext from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row from pyspark.sql.types import * -from pyspark.sql.types import UserDefinedType, _infer_type +from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier from pyspark.tests import ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window @@ -852,7 +852,7 @@ def test_convert_row_to_dict(self): self.assertEqual(1.0, row.asDict()['d']['key'].c) def test_udt(self): - from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _verify_type + from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _make_type_verifier from pyspark.sql.tests import ExamplePointUDT, ExamplePoint def check_datatype(datatype): @@ -868,8 +868,8 @@ def check_datatype(datatype): check_datatype(structtype_with_udt) p = ExamplePoint(1.0, 2.0) self.assertEqual(_infer_type(p), ExamplePointUDT()) - _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT()) - self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], ExamplePointUDT())) + _make_type_verifier(ExamplePointUDT())(ExamplePoint(1.0, 2.0)) + self.assertRaises(ValueError, lambda: _make_type_verifier(ExamplePointUDT())([1.0, 2.0])) check_datatype(PythonOnlyUDT()) structtype_with_udt = StructType([StructField("label", DoubleType(), False), @@ -877,8 +877,10 @@ def check_datatype(datatype): check_datatype(structtype_with_udt) p = PythonOnlyPoint(1.0, 2.0) self.assertEqual(_infer_type(p), PythonOnlyUDT()) - _verify_type(PythonOnlyPoint(1.0, 2.0), PythonOnlyUDT()) - self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], PythonOnlyUDT())) + _make_type_verifier(PythonOnlyUDT())(PythonOnlyPoint(1.0, 
2.0)) + self.assertRaises( + ValueError, + lambda: _make_type_verifier(PythonOnlyUDT())([1.0, 2.0])) def test_simple_udt_in_df(self): schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT()) @@ -2636,6 +2638,195 @@ def range_frame_match(): importlib.reload(window) + +class DataTypeVerificationTests(unittest.TestCase): + + def test_verify_type_exception_msg(self): + self.assertRaisesRegexp( + ValueError, + "test_name", + lambda: _make_type_verifier(StringType(), nullable=False, name="test_name")(None)) + + schema = StructType([StructField('a', StructType([StructField('b', IntegerType())]))]) + self.assertRaisesRegexp( + TypeError, + "field b in field a", + lambda: _make_type_verifier(schema)([["data"]])) + + def test_verify_type_ok_nullable(self): + obj = None + types = [IntegerType(), FloatType(), StringType(), StructType([])] + for data_type in types: + try: + _make_type_verifier(data_type, nullable=True)(obj) + except Exception: + self.fail("verify_type(%s, %s, nullable=True)" % (obj, data_type)) + + def test_verify_type_not_nullable(self): + import array + import datetime + import decimal + + schema = StructType([ + StructField('s', StringType(), nullable=False), + StructField('i', IntegerType(), nullable=True)]) + + class MyObj: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + # obj, data_type + success_spec = [ + # String + ("", StringType()), + (u"", StringType()), + (1, StringType()), + (1.0, StringType()), + ([], StringType()), + ({}, StringType()), + + # UDT + (ExamplePoint(1.0, 2.0), ExamplePointUDT()), + + # Boolean + (True, BooleanType()), + + # Byte + (-(2**7), ByteType()), + (2**7 - 1, ByteType()), + + # Short + (-(2**15), ShortType()), + (2**15 - 1, ShortType()), + + # Integer + (-(2**31), IntegerType()), + (2**31 - 1, IntegerType()), + + # Long + (2**64, LongType()), + + # Float & Double + (1.0, FloatType()), + (1.0, DoubleType()), + + # Decimal + (decimal.Decimal("1.0"), DecimalType()), + + # Binary + (bytearray([1, 2]), BinaryType()), + + # Date/Timestamp + (datetime.date(2000, 1, 2), DateType()), + (datetime.datetime(2000, 1, 2, 3, 4), DateType()), + (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()), + + # Array + ([], ArrayType(IntegerType())), + (["1", None], ArrayType(StringType(), containsNull=True)), + ([1, 2], ArrayType(IntegerType())), + ((1, 2), ArrayType(IntegerType())), + (array.array('h', [1, 2]), ArrayType(IntegerType())), + + # Map + ({}, MapType(StringType(), IntegerType())), + ({"a": 1}, MapType(StringType(), IntegerType())), + ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=True)), + + # Struct + ({"s": "a", "i": 1}, schema), + ({"s": "a", "i": None}, schema), + ({"s": "a"}, schema), + ({"s": "a", "f": 1.0}, schema), + (Row(s="a", i=1), schema), + (Row(s="a", i=None), schema), + (Row(s="a", i=1, f=1.0), schema), + (["a", 1], schema), + (["a", None], schema), + (("a", 1), schema), + (MyObj(s="a", i=1), schema), + (MyObj(s="a", i=None), schema), + (MyObj(s="a"), schema), + ] + + # obj, data_type, exception class + failure_spec = [ + # String (match anything but None) + (None, StringType(), ValueError), + + # UDT + (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError), + + # Boolean + (1, BooleanType(), TypeError), + ("True", BooleanType(), TypeError), + ([1], BooleanType(), TypeError), + + # Byte + (-(2**7) - 1, ByteType(), ValueError), + (2**7, ByteType(), ValueError), + ("1", ByteType(), TypeError), + (1.0, ByteType(), TypeError), + + # Short + (-(2**15) - 1, ShortType(), 
ValueError), + (2**15, ShortType(), ValueError), + + # Integer + (-(2**31) - 1, IntegerType(), ValueError), + (2**31, IntegerType(), ValueError), + + # Float & Double + (1, FloatType(), TypeError), + (1, DoubleType(), TypeError), + + # Decimal + (1.0, DecimalType(), TypeError), + (1, DecimalType(), TypeError), + ("1.0", DecimalType(), TypeError), + + # Binary + (1, BinaryType(), TypeError), + + # Date/Timestamp + ("2000-01-02", DateType(), TypeError), + (946811040, TimestampType(), TypeError), + + # Array + (["1", None], ArrayType(StringType(), containsNull=False), ValueError), + ([1, "2"], ArrayType(IntegerType()), TypeError), + + # Map + ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError), + ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError), + ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=False), + ValueError), + + # Struct + ({"s": "a", "i": "1"}, schema, TypeError), + (Row(s="a"), schema, ValueError), # Row can't have missing field + (Row(s="a", i="1"), schema, TypeError), + (["a"], schema, ValueError), + (["a", "1"], schema, TypeError), + (MyObj(s="a", i="1"), schema, TypeError), + (MyObj(s=None, i="1"), schema, ValueError), + ] + + # Check success cases + for obj, data_type in success_spec: + try: + _make_type_verifier(data_type, nullable=False)(obj) + except Exception: + self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type)) + + # Check failure cases + for obj, data_type, exp in failure_spec: + msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp) + with self.assertRaises(exp, msg=msg): + _make_type_verifier(data_type, nullable=False)(obj) + + if __name__ == "__main__": from pyspark.sql.tests import * if xmlrunner: diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 26b54a7fb3709..f5505ed4722ad 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1249,121 +1249,196 @@ def _infer_schema_type(obj, dataType): } -def _verify_type(obj, dataType, nullable=True): +def _make_type_verifier(dataType, nullable=True, name=None): """ - Verify the type of obj against dataType, raise a TypeError if they do not match. - - Also verify the value of obj against datatype, raise a ValueError if it's not within the allowed - range, e.g. using 128 as ByteType will overflow. Note that, Python float is not checked, so it - will become infinity when cast to Java float if it overflows. - - >>> _verify_type(None, StructType([])) - >>> _verify_type("", StringType()) - >>> _verify_type(0, LongType()) - >>> _verify_type(list(range(3)), ArrayType(ShortType())) - >>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL + Make a verifier that checks the type of obj against dataType and raises a TypeError if they do + not match. + + This verifier also checks the value of obj against datatype and raises a ValueError if it's not + within the allowed range, e.g. using 128 as ByteType will overflow. Note that, Python float is + not checked, so it will become infinity when cast to Java float if it overflows. + + >>> _make_type_verifier(StructType([]))(None) + >>> _make_type_verifier(StringType())("") + >>> _make_type_verifier(LongType())(0) + >>> _make_type_verifier(ArrayType(ShortType()))(list(range(3))) + >>> _make_type_verifier(ArrayType(StringType()))(set()) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... TypeError:... 
- >>> _verify_type({}, MapType(StringType(), IntegerType())) - >>> _verify_type((), StructType([])) - >>> _verify_type([], StructType([])) - >>> _verify_type([1], StructType([])) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> _make_type_verifier(MapType(StringType(), IntegerType()))({}) + >>> _make_type_verifier(StructType([]))(()) + >>> _make_type_verifier(StructType([]))([]) + >>> _make_type_verifier(StructType([]))([1]) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... ValueError:... >>> # Check if numeric values are within the allowed range. - >>> _verify_type(12, ByteType()) - >>> _verify_type(1234, ByteType()) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> _make_type_verifier(ByteType())(12) + >>> _make_type_verifier(ByteType())(1234) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... ValueError:... - >>> _verify_type(None, ByteType(), False) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> _make_type_verifier(ByteType(), False)(None) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... ValueError:... - >>> _verify_type([1, None], ArrayType(ShortType(), False)) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> _make_type_verifier( + ... ArrayType(ShortType(), False))([1, None]) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... ValueError:... - >>> _verify_type({None: 1}, MapType(StringType(), IntegerType())) + >>> _make_type_verifier(MapType(StringType(), IntegerType()))({None: 1}) Traceback (most recent call last): ... ValueError:... >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), False) - >>> _verify_type((1, None), schema) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> _make_type_verifier(schema)((1, None)) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... ValueError:... 
""" - if obj is None: - if nullable: - return - else: - raise ValueError("This field is not nullable, but got None") - # StringType can work with any types - if isinstance(dataType, StringType): - return + if name is None: + new_msg = lambda msg: msg + new_name = lambda n: "field %s" % n + else: + new_msg = lambda msg: "%s: %s" % (name, msg) + new_name = lambda n: "field %s in %s" % (n, name) - if isinstance(dataType, UserDefinedType): - if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType): - raise ValueError("%r is not an instance of type %r" % (obj, dataType)) - _verify_type(dataType.toInternal(obj), dataType.sqlType()) - return + def verify_nullability(obj): + if obj is None: + if nullable: + return True + else: + raise ValueError(new_msg("This field is not nullable, but got None")) + else: + return False _type = type(dataType) - assert _type in _acceptable_types, "unknown datatype: %s for object %r" % (dataType, obj) - if _type is StructType: - # check the type and fields later - pass - else: + def assert_acceptable_types(obj): + assert _type in _acceptable_types, \ + new_msg("unknown datatype: %s for object %r" % (dataType, obj)) + + def verify_acceptable_types(obj): # subclass of them can not be fromInternal in JVM if type(obj) not in _acceptable_types[_type]: - raise TypeError("%s can not accept object %r in type %s" % (dataType, obj, type(obj))) + raise TypeError(new_msg("%s can not accept object %r in type %s" + % (dataType, obj, type(obj)))) + + if isinstance(dataType, StringType): + # StringType can work with any types + verify_value = lambda _: _ + + elif isinstance(dataType, UserDefinedType): + verifier = _make_type_verifier(dataType.sqlType(), name=name) - if isinstance(dataType, ByteType): - if obj < -128 or obj > 127: - raise ValueError("object of ByteType out of range, got: %s" % obj) + def verify_udf(obj): + if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType): + raise ValueError(new_msg("%r is not an instance of type %r" % (obj, dataType))) + verifier(dataType.toInternal(obj)) + + verify_value = verify_udf + + elif isinstance(dataType, ByteType): + def verify_byte(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + if obj < -128 or obj > 127: + raise ValueError(new_msg("object of ByteType out of range, got: %s" % obj)) + + verify_value = verify_byte elif isinstance(dataType, ShortType): - if obj < -32768 or obj > 32767: - raise ValueError("object of ShortType out of range, got: %s" % obj) + def verify_short(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + if obj < -32768 or obj > 32767: + raise ValueError(new_msg("object of ShortType out of range, got: %s" % obj)) + + verify_value = verify_short elif isinstance(dataType, IntegerType): - if obj < -2147483648 or obj > 2147483647: - raise ValueError("object of IntegerType out of range, got: %s" % obj) + def verify_integer(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + if obj < -2147483648 or obj > 2147483647: + raise ValueError( + new_msg("object of IntegerType out of range, got: %s" % obj)) + + verify_value = verify_integer elif isinstance(dataType, ArrayType): - for i in obj: - _verify_type(i, dataType.elementType, dataType.containsNull) + element_verifier = _make_type_verifier( + dataType.elementType, dataType.containsNull, name="element in array %s" % name) + + def verify_array(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + for i in obj: + element_verifier(i) + + verify_value = verify_array elif isinstance(dataType, 
MapType): - for k, v in obj.items(): - _verify_type(k, dataType.keyType, False) - _verify_type(v, dataType.valueType, dataType.valueContainsNull) + key_verifier = _make_type_verifier(dataType.keyType, False, name="key of map %s" % name) + value_verifier = _make_type_verifier( + dataType.valueType, dataType.valueContainsNull, name="value of map %s" % name) + + def verify_map(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + for k, v in obj.items(): + key_verifier(k) + value_verifier(v) + + verify_value = verify_map elif isinstance(dataType, StructType): - if isinstance(obj, dict): - for f in dataType.fields: - _verify_type(obj.get(f.name), f.dataType, f.nullable) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - # the order in obj could be different than dataType.fields - for f in dataType.fields: - _verify_type(obj[f.name], f.dataType, f.nullable) - elif isinstance(obj, (tuple, list)): - if len(obj) != len(dataType.fields): - raise ValueError("Length of object (%d) does not match with " - "length of fields (%d)" % (len(obj), len(dataType.fields))) - for v, f in zip(obj, dataType.fields): - _verify_type(v, f.dataType, f.nullable) - elif hasattr(obj, "__dict__"): - d = obj.__dict__ - for f in dataType.fields: - _verify_type(d.get(f.name), f.dataType, f.nullable) - else: - raise TypeError("StructType can not accept object %r in type %s" % (obj, type(obj))) + verifiers = [] + for f in dataType.fields: + verifier = _make_type_verifier(f.dataType, f.nullable, name=new_name(f.name)) + verifiers.append((f.name, verifier)) + + def verify_struct(obj): + assert_acceptable_types(obj) + + if isinstance(obj, dict): + for f, verifier in verifiers: + verifier(obj.get(f)) + elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): + # the order in obj could be different than dataType.fields + for f, verifier in verifiers: + verifier(obj[f]) + elif isinstance(obj, (tuple, list)): + if len(obj) != len(verifiers): + raise ValueError( + new_msg("Length of object (%d) does not match with " + "length of fields (%d)" % (len(obj), len(verifiers)))) + for v, (_, verifier) in zip(obj, verifiers): + verifier(v) + elif hasattr(obj, "__dict__"): + d = obj.__dict__ + for f, verifier in verifiers: + verifier(d.get(f)) + else: + raise TypeError(new_msg("StructType can not accept object %r in type %s" + % (obj, type(obj)))) + verify_value = verify_struct + + else: + def verify_default(obj): + assert_acceptable_types(obj) + verify_acceptable_types(obj) + + verify_value = verify_default + + def verify(obj): + if not verify_nullability(obj): + verify_value(obj) + + return verify # This is used to unpickle a Row from JVM From 29b1f6b09f98e216af71e893a9da0c4717c80679 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 4 Jul 2017 08:54:07 -0700 Subject: [PATCH 081/779] [SPARK-21256][SQL] Add withSQLConf to Catalyst Test ### What changes were proposed in this pull request? SQLConf is moved to Catalyst. We are adding more and more test cases for verifying the conf-specific behaviors. It is nice to add a helper function to simplify the test cases. ### How was this patch tested? N/A Author: gatorsmile Closes #18469 from gatorsmile/withSQLConf. 
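For reference, the rewritten tests below all follow this shape (excerpted from the `InferFiltersFromConstraintsSuite` hunk in this patch); the helper scopes the conf change to the block and restores the previous value afterwards, replacing the manual `try { SQLConf.get.setConf(...) } finally { SQLConf.get.unsetConf(...) }` boilerplate:

```scala
// Excerpt of the post-change pattern used throughout the suites updated in this patch.
test("No inferred filter when constraint propagation is disabled") {
  withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") {
    val originalQuery = testRelation.where('a === 1 && 'a === 'b).analyze
    val optimized = Optimize.execute(originalQuery)
    comparePlans(optimized, originalQuery)
  }
}
```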
--- .../InferFiltersFromConstraintsSuite.scala | 5 +-- .../optimizer/OuterJoinEliminationSuite.scala | 6 +--- .../optimizer/PruneFiltersSuite.scala | 6 +--- .../plans/ConstraintPropagationSuite.scala | 24 +++++++------- .../spark/sql/catalyst/plans/PlanTest.scala | 32 ++++++++++++++++++- .../AggregateEstimationSuite.scala | 9 ++---- .../BasicStatsEstimationSuite.scala | 12 +++---- .../spark/sql/SparkSessionBuilderSuite.scala | 3 ++ .../apache/spark/sql/test/SQLTestUtils.scala | 30 ++++------------- 9 files changed, 64 insertions(+), 63 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index cdc9f25cf8777..d2dd469e2d74f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -206,13 +206,10 @@ class InferFiltersFromConstraintsSuite extends PlanTest { } test("No inferred filter when constraint propagation is disabled") { - try { - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, false) + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { val originalQuery = testRelation.where('a === 1 && 'a === 'b).analyze val optimized = Optimize.execute(originalQuery) comparePlans(optimized, originalQuery) - } finally { - SQLConf.get.unsetConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala index 623ff3d446a5f..893c111c2906b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OuterJoinEliminationSuite.scala @@ -234,9 +234,7 @@ class OuterJoinEliminationSuite extends PlanTest { } test("no outer join elimination if constraint propagation is disabled") { - try { - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, false) - + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { val x = testRelation.subquery('x) val y = testRelation1.subquery('y) @@ -251,8 +249,6 @@ class OuterJoinEliminationSuite extends PlanTest { val optimized = Optimize.execute(originalQuery.analyze) comparePlans(optimized, originalQuery.analyze) - } finally { - SQLConf.get.unsetConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PruneFiltersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PruneFiltersSuite.scala index 706634cdd29b8..6d1a05f3c998e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PruneFiltersSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PruneFiltersSuite.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.CONSTRAINT_PROPAGATION_ENABLED class PruneFiltersSuite extends PlanTest { @@ -149,8 +148,7 @@ class PruneFiltersSuite extends PlanTest { ("tr1.a".attr > 10 || "tr1.c".attr < 10) && 
'd.attr < 100) - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, false) - try { + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { val optimized = Optimize.execute(queryWithUselessFilter.analyze) // When constraint propagation is disabled, the useless filter won't be pruned. // It gets pushed down. Because the rule `CombineFilters` runs only once, there are redundant @@ -160,8 +158,6 @@ class PruneFiltersSuite extends PlanTest { .join(tr2.where('d.attr < 100).where('d.attr < 100), Inner, Some("tr1.a".attr === "tr2.a".attr)).analyze comparePlans(optimized, correctAnswer) - } finally { - SQLConf.get.unsetConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index a3948d90b0e4d..a37e06d922642 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, DoubleType, IntegerType, LongType, StringType} -class ConstraintPropagationSuite extends SparkFunSuite { +class ConstraintPropagationSuite extends SparkFunSuite with PlanTest { private def resolveColumn(tr: LocalRelation, columnName: String): Expression = resolveColumn(tr.analyze, columnName) @@ -400,26 +400,26 @@ class ConstraintPropagationSuite extends SparkFunSuite { } test("enable/disable constraint propagation") { - try { - val tr = LocalRelation('a.int, 'b.string, 'c.int) - val filterRelation = tr.where('a.attr > 10) + val tr = LocalRelation('a.int, 'b.string, 'c.int) + val filterRelation = tr.where('a.attr > 10) - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, true) + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "true") { assert(filterRelation.analyze.constraints.nonEmpty) + } - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, false) + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { assert(filterRelation.analyze.constraints.isEmpty) + } - val aliasedRelation = tr.where('c.attr > 10 && 'a.attr < 5) - .groupBy('a, 'c, 'b)('a, 'c.as("c1"), count('a).as("a3")).select('c1, 'a, 'a3) + val aliasedRelation = tr.where('c.attr > 10 && 'a.attr < 5) + .groupBy('a, 'c, 'b)('a, 'c.as("c1"), count('a).as("a3")).select('c1, 'a, 'a3) - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, true) + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "true") { assert(aliasedRelation.analyze.constraints.nonEmpty) + } - SQLConf.get.setConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED, false) + withSQLConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { assert(aliasedRelation.analyze.constraints.isEmpty) - } finally { - SQLConf.get.unsetConf(SQLConf.CONSTRAINT_PROPAGATION_ENABLED) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 6883d23d477e4..e9679d3361509 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.plans import 
org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression @@ -28,8 +29,9 @@ import org.apache.spark.sql.internal.SQLConf /** * Provides helper methods for comparing plans. */ -abstract class PlanTest extends SparkFunSuite with PredicateHelper { +trait PlanTest extends SparkFunSuite with PredicateHelper { + // TODO(gatorsmile): remove this from PlanTest and all the analyzer/optimizer rules protected val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true) /** @@ -142,4 +144,32 @@ abstract class PlanTest extends SparkFunSuite with PredicateHelper { plan1 == plan2 } } + + /** + * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL + * configurations. + */ + protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SQLConf.get + val (keys, values) = pairs.unzip + val currentValues = keys.map { key => + if (conf.contains(key)) { + Some(conf.getConfString(key)) + } else { + None + } + } + (keys, values).zipped.foreach { (k, v) => + if (SQLConf.staticConfKeys.contains(k)) { + throw new AnalysisException(s"Cannot modify the value of a static config: $k") + } + conf.setConfString(k, v) + } + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => conf.setConfString(key, value) + case (key, None) => conf.unsetConf(key) + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala index 30ddf03bd3c4f..23f95a6cc2ac2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala @@ -19,12 +19,13 @@ package org.apache.spark.sql.catalyst.statsEstimation import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate.Count +import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.internal.SQLConf -class AggregateEstimationSuite extends StatsEstimationTestBase { +class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { /** Columns for testing */ private val columnInfo: AttributeMap[ColumnStat] = AttributeMap(Seq( @@ -100,9 +101,7 @@ class AggregateEstimationSuite extends StatsEstimationTestBase { size = Some(4 * (8 + 4)), attributeStats = AttributeMap(Seq("key12").map(nameToColInfo))) - val originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) - try { - SQLConf.get.setConf(SQLConf.CBO_ENABLED, false) + withSQLConf(SQLConf.CBO_ENABLED.key -> "false") { val noGroupAgg = Aggregate(groupingExpressions = Nil, aggregateExpressions = Seq(Alias(Count(Literal(1)), "cnt")()), child) assert(noGroupAgg.stats == @@ -114,8 +113,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase { assert(hasGroupAgg.stats == // From UnaryNode.computeStats, childSize * outputRowSize / childRowSize Statistics(sizeInBytes = 48 * (8 + 4 + 8) / (8 + 4))) - } finally { - SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) 
} } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index 31a8cbdee9777..5fd21a06a109d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -18,12 +18,13 @@ package org.apache.spark.sql.catalyst.statsEstimation import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Literal} +import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType -class BasicStatsEstimationSuite extends StatsEstimationTestBase { +class BasicStatsEstimationSuite extends PlanTest with StatsEstimationTestBase { val attribute = attr("key") val colStat = ColumnStat(distinctCount = 10, min = Some(1), max = Some(10), nullCount = 0, avgLen = 4, maxLen = 4) @@ -82,18 +83,15 @@ class BasicStatsEstimationSuite extends StatsEstimationTestBase { plan: LogicalPlan, expectedStatsCboOn: Statistics, expectedStatsCboOff: Statistics): Unit = { - val originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED) - try { + withSQLConf(SQLConf.CBO_ENABLED.key -> "true") { // Invalidate statistics plan.invalidateStatsCache() - SQLConf.get.setConf(SQLConf.CBO_ENABLED, true) assert(plan.stats == expectedStatsCboOn) + } + withSQLConf(SQLConf.CBO_ENABLED.key -> "false") { plan.invalidateStatsCache() - SQLConf.get.setConf(SQLConf.CBO_ENABLED, false) assert(plan.stats == expectedStatsCboOff) - } finally { - SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index 4f6d5f79d466e..cdac6827082c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.sql.internal.SQLConf /** * Test cases for the builder pattern of [[SparkSession]]. 
@@ -67,6 +68,8 @@ class SparkSessionBuilderSuite extends SparkFunSuite { assert(activeSession != defaultSession) assert(session == activeSession) assert(session.conf.get("spark-config2") == "a") + assert(session.sessionState.conf == SQLConf.get) + assert(SQLConf.get.getConfString("spark-config2") == "a") SparkSession.clearActiveSession() assert(SparkSession.builder().getOrCreate() == defaultSession) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index d74a7cce25ed6..92ee7d596acd1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -35,9 +35,11 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.FilterExec +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.{UninterruptibleThread, Utils} /** @@ -53,7 +55,8 @@ import org.apache.spark.util.{UninterruptibleThread, Utils} private[sql] trait SQLTestUtils extends SparkFunSuite with Eventually with BeforeAndAfterAll - with SQLTestData { self => + with SQLTestData + with PlanTest { self => protected def sparkContext = spark.sparkContext @@ -89,28 +92,9 @@ private[sql] trait SQLTestUtils } } - /** - * Sets all SQL configurations specified in `pairs`, calls `f`, and then restore all SQL - * configurations. - * - * @todo Probably this method should be moved to a more general place - */ - protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { - val (keys, values) = pairs.unzip - val currentValues = keys.map { key => - if (spark.conf.contains(key)) { - Some(spark.conf.get(key)) - } else { - None - } - } - (keys, values).zipped.foreach(spark.conf.set) - try f finally { - keys.zip(currentValues).foreach { - case (key, Some(value)) => spark.conf.set(key, value) - case (key, None) => spark.conf.unset(key) - } - } + protected override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + SparkSession.setActiveSession(spark) + super.withSQLConf(pairs: _*)(f) } /** From a3c29fcbbda02c1528b4185bcb880c91077d480c Mon Sep 17 00:00:00 2001 From: "YIHAODIAN\\wangshuangshuang" Date: Tue, 4 Jul 2017 09:44:27 -0700 Subject: [PATCH 082/779] [SPARK-19726][SQL] Failed to insert null timestamp value to mysql using spark jdbc ## What changes were proposed in this pull request? When creating a table like the following: > create table timestamp_test(id int(11), time_stamp timestamp not null default current_timestamp); The result of executing "insert into timestamp_test values (111, null)" is different between Spark and JDBC. ``` mysql> select * from timestamp_test; +------+---------------------+ | id | time_stamp | +------+---------------------+ | 111 | 1970-01-01 00:00:00 | -> spark | 111 | 2017-06-27 19:32:38 | -> mysql +------+---------------------+ 2 rows in set (0.00 sec) ``` Because in such a case ```StructField.nullable``` is false, the generated codes of ```InvokeLike``` and ```BoundReference``` don't check whether the field is null or not.
Instead, they directly use ```CodegenContext.INPUT_ROW.getLong(1)```, however, ```UnsafeRow.setNullAt(1)``` will put 0 in the underlying memory. The PR will ```always``` set ```StructField.nullable``` true after obtaining metadata from jdbc connection, Since we can insert null to not null timestamp column in MySQL. In this way, spark will propagate null to underlying DB engine, and let DB to choose how to process NULL. ## How was this patch tested? Added tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: YIHAODIAN\wangshuangshuang Author: Shuangshuang Wang Closes #18445 from shuangshuangwang/SPARK-19726. --- .../sql/execution/datasources/jdbc/JDBCRDD.scala | 2 +- .../sql/execution/datasources/jdbc/JdbcUtils.scala | 12 ++++++++++-- .../org/apache/spark/sql/jdbc/JDBCWriteSuite.scala | 8 ++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 0f53b5c7c6f0f..57e9bc9b70454 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -59,7 +59,7 @@ object JDBCRDD extends Logging { try { val rs = statement.executeQuery() try { - JdbcUtils.getSchema(rs, dialect) + JdbcUtils.getSchema(rs, dialect, alwaysNullable = true) } finally { rs.close() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index ca61c2efe2ddf..55b2539c13381 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -266,10 +266,14 @@ object JdbcUtils extends Logging { /** * Takes a [[ResultSet]] and returns its Catalyst schema. * + * @param alwaysNullable If true, all the columns are nullable. * @return A [[StructType]] giving the Catalyst schema. * @throws SQLException if the schema contains an unsupported type. 
*/ - def getSchema(resultSet: ResultSet, dialect: JdbcDialect): StructType = { + def getSchema( + resultSet: ResultSet, + dialect: JdbcDialect, + alwaysNullable: Boolean = false): StructType = { val rsmd = resultSet.getMetaData val ncols = rsmd.getColumnCount val fields = new Array[StructField](ncols) @@ -290,7 +294,11 @@ object JdbcUtils extends Logging { rsmd.getClass.getName == "org.apache.hive.jdbc.HiveResultSetMetaData" => true } } - val nullable = rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls + val nullable = if (alwaysNullable) { + true + } else { + rsmd.isNullable(i + 1) != ResultSetMetaData.columnNoNulls + } val metadata = new MetadataBuilder() .putString("name", columnName) .putLong("scale", fieldScale) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index bf1fd160704fa..92f50a095f19b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -24,6 +24,7 @@ import scala.collection.JavaConverters.propertiesAsScalaMapConverter import org.scalatest.BeforeAndAfter +import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} @@ -506,4 +507,11 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { "schema struct")) } } + + test("SPARK-19726: INSERT null to a NOT NULL column") { + val e = intercept[SparkException] { + sql("INSERT INTO PEOPLE1 values (null, null)") + }.getMessage + assert(e.contains("NULL not allowed for column \"NAME\"")) + } } From 1b50e0e0d6fd9d1b815a3bb37647ea659222e3f1 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 4 Jul 2017 09:48:40 -0700 Subject: [PATCH 083/779] [SPARK-20256][SQL] SessionState should be created more lazily ## What changes were proposed in this pull request? `SessionState` is designed to be created lazily. However, in reality, it created immediately in `SparkSession.Builder.getOrCreate` ([here](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala#L943)). This PR aims to recover the lazy behavior by keeping the options into `initialSessionOptions`. The benefit is like the following. Users can start `spark-shell` and use RDD operations without any problems. **BEFORE** ```scala $ bin/spark-shell java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder' ... Caused by: org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:java.security.AccessControlException: Permission denied: user=spark, access=READ, inode="/apps/hive/warehouse":hive:hdfs:drwx------ ``` As reported in SPARK-20256, this happens when the warehouse directory is not allowed for this user. **AFTER** ```scala $ bin/spark-shell ... Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 2.3.0-SNAPSHOT /_/ Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_112) Type in expressions to have them evaluated. Type :help for more information. scala> sc.range(0, 10, 1).count() res0: Long = 10 ``` ## How was this patch tested? Manual. This closes #18512 . Author: Dongjoon Hyun Closes #18501 from dongjoon-hyun/SPARK-20256. 
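A minimal sketch of the deferred-initialization idea described above (hypothetical names; the real change is the `SparkSession` diff below): options are buffered at build time and only applied when the lazily created state is first accessed.

```scala
// Sketch only: buffer options, defer expensive construction until first use.
class LazySessionHolder {
  private val initialOptions = scala.collection.mutable.HashMap.empty[String, String]

  def config(key: String, value: String): Unit = initialOptions.put(key, value)

  // Expensive setup (e.g. touching the Hive metastore) runs on first access only,
  // after which the buffered options are applied.
  lazy val state: Map[String, String] = {
    val built = Map("builtin" -> "defaults") // stand-in for instantiateSessionState
    built ++ initialOptions
  }
}
```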
--- .../scala/org/apache/spark/sql/SparkSession.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 2c38f7d7c88da..0ddcd2111aa58 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -117,6 +117,12 @@ class SparkSession private( existingSharedState.getOrElse(new SharedState(sparkContext)) } + /** + * Initial options for session. This options are applied once when sessionState is created. + */ + @transient + private[sql] val initialSessionOptions = new scala.collection.mutable.HashMap[String, String] + /** * State isolated across sessions, including SQL configurations, temporary tables, registered * functions, and everything else that accepts a [[org.apache.spark.sql.internal.SQLConf]]. @@ -132,9 +138,11 @@ class SparkSession private( parentSessionState .map(_.clone(this)) .getOrElse { - SparkSession.instantiateSessionState( + val state = SparkSession.instantiateSessionState( SparkSession.sessionStateClassName(sparkContext.conf), self) + initialSessionOptions.foreach { case (k, v) => state.conf.setConfString(k, v) } + state } } @@ -940,7 +948,7 @@ object SparkSession { } session = new SparkSession(sparkContext, None, None, extensions) - options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } + options.foreach { case (k, v) => session.initialSessionOptions.put(k, v) } defaultSession.set(session) // Register a successfully instantiated context to the singleton. This should be at the From 4d6d8192c807006ff89488a1d38bc6f7d41de5cf Mon Sep 17 00:00:00 2001 From: dardelet Date: Tue, 4 Jul 2017 17:58:44 +0100 Subject: [PATCH 084/779] [SPARK-21268][MLLIB] Move center calculations to a distributed map in KMeans ## What changes were proposed in this pull request? The scal() and creation of newCenter vector is done in the driver, after a collectAsMap operation while it could be done in the distributed RDD. This PR moves this code before the collectAsMap for more efficiency ## How was this patch tested? This was tested manually by running the KMeansExample and verifying that the new code ran without error and gave same output as before. Author: dardelet Author: Guillaume Dardelet Closes #18491 from dardelet/move-center-calculation-to-distributed-map-kmean. 
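To make the data movement concrete, here is a self-contained sketch of the pattern this patch adopts (toy data, not the MLlib code itself): the per-center average is finalized inside `mapValues` on the executors, so `collectAsMap` hands the driver finished centers rather than raw (sum, count) pairs it still has to divide.

```scala
import org.apache.spark.sql.SparkSession

// Toy illustration of finalizing per-key averages on the executors before collecting.
object DistributedCenterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("kmeans-sketch").getOrCreate()
    val sc = spark.sparkContext
    // (closest center id, point value) pairs, as produced by the assignment step
    val contributions = sc.parallelize(Seq((0, 2.0), (0, 4.0), (1, 10.0)))
    val newCenters = contributions
      .mapValues(v => (v, 1L))
      .reduceByKey { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) }
      .mapValues { case (sum, count) => sum / count } // the division now runs on executors
      .collectAsMap()                                 // the driver only sees finished centers
    println(newCenters)                               // Map(0 -> 3.0, 1 -> 10.0)
    spark.stop()
  }
}
```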
--- .../org/apache/spark/mllib/clustering/KMeans.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index fa72b72e2d921..98e50c5b45cfd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -272,8 +272,8 @@ class KMeans private ( val costAccum = sc.doubleAccumulator val bcCenters = sc.broadcast(centers) - // Find the sum and count of points mapping to each center - val totalContribs = data.mapPartitions { points => + // Find the new centers + val newCenters = data.mapPartitions { points => val thisCenters = bcCenters.value val dims = thisCenters.head.vector.size @@ -292,15 +292,16 @@ class KMeans private ( }.reduceByKey { case ((sum1, count1), (sum2, count2)) => axpy(1.0, sum2, sum1) (sum1, count1 + count2) + }.mapValues { case (sum, count) => + scal(1.0 / count, sum) + new VectorWithNorm(sum) }.collectAsMap() bcCenters.destroy(blocking = false) // Update the cluster centers and costs converged = true - totalContribs.foreach { case (j, (sum, count)) => - scal(1.0 / count, sum) - val newCenter = new VectorWithNorm(sum) + newCenters.foreach { case (j, newCenter) => if (converged && KMeans.fastSquaredDistance(newCenter, centers(j)) > epsilon * epsilon) { converged = false } From cec392150451a64c9c2902b7f8f4b3b38f25cbea Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 4 Jul 2017 12:18:51 -0700 Subject: [PATCH 085/779] [SPARK-20889][SPARKR] Grouped documentation for WINDOW column methods ## What changes were proposed in this pull request? Grouped documentation for column window methods. Author: actuaryzhang Closes #18481 from actuaryzhang/sparkRDocWindow. --- R/pkg/R/functions.R | 225 ++++++++++++++------------------------------ R/pkg/R/generics.R | 28 +++--- 2 files changed, 88 insertions(+), 165 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a1f5c4f8cc18d..8c12308c1d7c1 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -200,6 +200,34 @@ NULL #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))} NULL +#' Window functions for Column operations +#' +#' Window functions defined for \code{Column}. +#' +#' @param x In \code{lag} and \code{lead}, it is the column as a character string or a Column +#' to compute on. In \code{ntile}, it is the number of ntile groups. +#' @param offset In \code{lag}, the number of rows back from the current row from which to obtain +#' a value. In \code{lead}, the number of rows after the current row from which to +#' obtain a value. If not specified, the default is 1. +#' @param defaultValue (optional) default to use when the offset row does not exist. +#' @param ... additional argument(s). 
+#' @name column_window_functions +#' @rdname column_window_functions +#' @family window functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' ws <- orderBy(windowPartitionBy("am"), "hp") +#' tmp <- mutate(df, dist = over(cume_dist(), ws), dense_rank = over(dense_rank(), ws), +#' lag = over(lag(df$mpg), ws), lead = over(lead(df$mpg, 1), ws), +#' percent_rank = over(percent_rank(), ws), +#' rank = over(rank(), ws), row_number = over(row_number(), ws)) +#' # Get ntile group id (1-4) for hp +#' tmp <- mutate(tmp, ntile = over(ntile(4), ws)) +#' head(tmp)} +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -2844,27 +2872,16 @@ setMethod("ifelse", ###################### Window functions###################### -#' cume_dist -#' -#' Window function: returns the cumulative distribution of values within a window partition, -#' i.e. the fraction of rows that are below the current row. -#' -#' N = total number of rows in the partition -#' cume_dist(x) = number of values before (and including) x / N -#' +#' @details +#' \code{cume_dist}: Returns the cumulative distribution of values within a window partition, +#' i.e. the fraction of rows that are below the current row: +#' (number of values before and including x) / (total number of rows in the partition). #' This is equivalent to the \code{CUME_DIST} function in SQL. +#' The method should be used with no argument. #' -#' @rdname cume_dist -#' @name cume_dist -#' @family window functions -#' @aliases cume_dist,missing-method +#' @rdname column_window_functions +#' @aliases cume_dist cume_dist,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(cume_dist(), ws), df$hp, df$am) -#' } #' @note cume_dist since 1.6.0 setMethod("cume_dist", signature("missing"), @@ -2873,28 +2890,19 @@ setMethod("cume_dist", column(jc) }) -#' dense_rank -#' -#' Window function: returns the rank of rows within a window partition, without any gaps. +#' @details +#' \code{dense_rank}: Returns the rank of rows within a window partition, without any gaps. #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second #' place and that the next person came in third. Rank would give me sequential numbers, making #' the person that came in third place (after the ties) would register as coming in fifth. -#' #' This is equivalent to the \code{DENSE_RANK} function in SQL. +#' The method should be used with no argument. 
#' -#' @rdname dense_rank -#' @name dense_rank -#' @family window functions -#' @aliases dense_rank,missing-method +#' @rdname column_window_functions +#' @aliases dense_rank dense_rank,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(dense_rank(), ws), df$hp, df$am) -#' } #' @note dense_rank since 1.6.0 setMethod("dense_rank", signature("missing"), @@ -2903,34 +2911,15 @@ setMethod("dense_rank", column(jc) }) -#' lag -#' -#' Window function: returns the value that is \code{offset} rows before the current row, and +#' @details +#' \code{lag}: Returns the value that is \code{offset} rows before the current row, and #' \code{defaultValue} if there is less than \code{offset} rows before the current row. For example, #' an \code{offset} of one will return the previous row at any given point in the window partition. -#' #' This is equivalent to the \code{LAG} function in SQL. #' -#' @param x the column as a character string or a Column to compute on. -#' @param offset the number of rows back from the current row from which to obtain a value. -#' If not specified, the default is 1. -#' @param defaultValue (optional) default to use when the offset row does not exist. -#' @param ... further arguments to be passed to or from other methods. -#' @rdname lag -#' @name lag -#' @aliases lag,characterOrColumn-method -#' @family window functions +#' @rdname column_window_functions +#' @aliases lag lag,characterOrColumn-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' -#' # Partition by am (transmission) and order by hp (horsepower) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' -#' # Lag mpg values by 1 row on the partition-and-ordered table -#' out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am) -#' } #' @note lag since 1.6.0 setMethod("lag", signature(x = "characterOrColumn"), @@ -2946,35 +2935,16 @@ setMethod("lag", column(jc) }) -#' lead -#' -#' Window function: returns the value that is \code{offset} rows after the current row, and +#' @details +#' \code{lead}: Returns the value that is \code{offset} rows after the current row, and #' \code{defaultValue} if there is less than \code{offset} rows after the current row. #' For example, an \code{offset} of one will return the next row at any given point #' in the window partition. -#' #' This is equivalent to the \code{LEAD} function in SQL. #' -#' @param x the column as a character string or a Column to compute on. -#' @param offset the number of rows after the current row from which to obtain a value. -#' If not specified, the default is 1. -#' @param defaultValue (optional) default to use when the offset row does not exist. 
-#' -#' @rdname lead -#' @name lead -#' @family window functions -#' @aliases lead,characterOrColumn,numeric-method +#' @rdname column_window_functions +#' @aliases lead lead,characterOrColumn,numeric-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' -#' # Partition by am (transmission) and order by hp (horsepower) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' -#' # Lead mpg values by 1 row on the partition-and-ordered table -#' out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am) -#' } #' @note lead since 1.6.0 setMethod("lead", signature(x = "characterOrColumn", offset = "numeric", defaultValue = "ANY"), @@ -2990,31 +2960,15 @@ setMethod("lead", column(jc) }) -#' ntile -#' -#' Window function: returns the ntile group id (from 1 to n inclusive) in an ordered window +#' @details +#' \code{ntile}: Returns the ntile group id (from 1 to n inclusive) in an ordered window #' partition. For example, if n is 4, the first quarter of the rows will get value 1, the second #' quarter will get 2, the third quarter will get 3, and the last quarter will get 4. -#' #' This is equivalent to the \code{NTILE} function in SQL. #' -#' @param x Number of ntile groups -#' -#' @rdname ntile -#' @name ntile -#' @aliases ntile,numeric-method -#' @family window functions +#' @rdname column_window_functions +#' @aliases ntile ntile,numeric-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' -#' # Partition by am (transmission) and order by hp (horsepower) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' -#' # Get ntile group id (1-4) for hp -#' out <- select(df, over(ntile(4), ws), df$hp, df$am) -#' } #' @note ntile since 1.6.0 setMethod("ntile", signature(x = "numeric"), @@ -3023,27 +2977,15 @@ setMethod("ntile", column(jc) }) -#' percent_rank -#' -#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition. -#' -#' This is computed by: -#' -#' (rank of row in its partition - 1) / (number of rows in the partition - 1) -#' -#' This is equivalent to the PERCENT_RANK function in SQL. +#' @details +#' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window partition. +#' This is computed by: (rank of row in its partition - 1) / (number of rows in the partition - 1). +#' This is equivalent to the \code{PERCENT_RANK} function in SQL. +#' The method should be used with no argument. #' -#' @rdname percent_rank -#' @name percent_rank -#' @family window functions -#' @aliases percent_rank,missing-method +#' @rdname column_window_functions +#' @aliases percent_rank percent_rank,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(percent_rank(), ws), df$hp, df$am) -#' } #' @note percent_rank since 1.6.0 setMethod("percent_rank", signature("missing"), @@ -3052,29 +2994,19 @@ setMethod("percent_rank", column(jc) }) -#' rank -#' -#' Window function: returns the rank of rows within a window partition. -#' +#' @details +#' \code{rank}: Returns the rank of rows within a window partition. #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second #' place and that the next person came in third. 
Rank would give me sequential numbers, making #' the person that came in third place (after the ties) would register as coming in fifth. +#' This is equivalent to the \code{RANK} function in SQL. +#' The method should be used with no argument. #' -#' This is equivalent to the RANK function in SQL. -#' -#' @rdname rank -#' @name rank -#' @family window functions -#' @aliases rank,missing-method +#' @rdname column_window_functions +#' @aliases rank rank,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(rank(), ws), df$hp, df$am) -#' } #' @note rank since 1.6.0 setMethod("rank", signature(x = "missing"), @@ -3083,11 +3015,7 @@ setMethod("rank", column(jc) }) -# Expose rank() in the R base package -#' @param x a numeric, complex, character or logical vector. -#' @param ... additional argument(s) passed to the method. -#' @name rank -#' @rdname rank +#' @rdname column_window_functions #' @aliases rank,ANY-method #' @export setMethod("rank", @@ -3096,23 +3024,14 @@ setMethod("rank", base::rank(x, ...) }) -#' row_number -#' -#' Window function: returns a sequential number starting at 1 within a window partition. -#' -#' This is equivalent to the ROW_NUMBER function in SQL. +#' @details +#' \code{row_number}: Returns a sequential number starting at 1 within a window partition. +#' This is equivalent to the \code{ROW_NUMBER} function in SQL. +#' The method should be used with no argument. #' -#' @rdname row_number -#' @name row_number -#' @aliases row_number,missing-method -#' @family window functions +#' @rdname column_window_functions +#' @aliases row_number row_number,missing-method #' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) -#' ws <- orderBy(windowPartitionBy("am"), "hp") -#' out <- select(df, over(row_number(), ws), df$hp, df$am) -#' } #' @note row_number since 1.6.0 setMethod("row_number", signature("missing"), diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index b901b74e4728d..beac18e412736 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1013,9 +1013,9 @@ setGeneric("create_map", function(x, ...) { standardGeneric("create_map") }) #' @name NULL setGeneric("hash", function(x, ...) { standardGeneric("hash") }) -#' @param x empty. Should be used with no argument. -#' @rdname cume_dist +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("cume_dist", function(x = "missing") { standardGeneric("cume_dist") }) #' @rdname column_datetime_diff_functions @@ -1053,9 +1053,9 @@ setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") }) #' @name NULL setGeneric("decode", function(x, charset) { standardGeneric("decode") }) -#' @param x empty. Should be used with no argument. -#' @rdname dense_rank +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("dense_rank", function(x = "missing") { standardGeneric("dense_rank") }) #' @rdname column_string_functions @@ -1159,8 +1159,9 @@ setGeneric("isnan", function(x) { standardGeneric("isnan") }) #' @name NULL setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") }) -#' @rdname lag +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("lag", function(x, ...) { standardGeneric("lag") }) #' @rdname last @@ -1172,8 +1173,9 @@ setGeneric("last", function(x, ...) 
{ standardGeneric("last") }) #' @name NULL setGeneric("last_day", function(x) { standardGeneric("last_day") }) -#' @rdname lead +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("lead", function(x, offset, defaultValue = NULL) { standardGeneric("lead") }) #' @rdname column_nonaggregate_functions @@ -1260,8 +1262,9 @@ setGeneric("not", function(x) { standardGeneric("not") }) #' @name NULL setGeneric("next_day", function(y, x) { standardGeneric("next_day") }) -#' @rdname ntile +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @rdname column_aggregate_functions @@ -1269,9 +1272,9 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @name NULL setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) -#' @param x empty. Should be used with no argument. -#' @rdname percent_rank +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") }) #' @rdname column_math_functions @@ -1304,8 +1307,9 @@ setGeneric("rand", function(seed) { standardGeneric("rand") }) #' @name NULL setGeneric("randn", function(seed) { standardGeneric("randn") }) -#' @rdname rank +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("rank", function(x, ...) { standardGeneric("rank") }) #' @rdname column_string_functions @@ -1334,9 +1338,9 @@ setGeneric("reverse", function(x) { standardGeneric("reverse") }) #' @name NULL setGeneric("rint", function(x) { standardGeneric("rint") }) -#' @param x empty. Should be used with no argument. -#' @rdname row_number +#' @rdname column_window_functions #' @export +#' @name NULL setGeneric("row_number", function(x = "missing") { standardGeneric("row_number") }) #' @rdname column_string_functions From daabf425ec0272951b11f286e4bec7a48f42cc0d Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Tue, 4 Jul 2017 12:37:29 -0700 Subject: [PATCH 086/779] [MINOR][SPARKR] ignore Rplots.pdf test output after running R tests ## What changes were proposed in this pull request? After running R tests in local build, it outputs Rplots.pdf. This one should be ignored in the git repository. Author: wangmiao1981 Closes #18518 from wangmiao1981/ignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1d91b43c23fa7..cf9780db37ad7 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ R-unit-tests.log R/unit-tests.out R/cran-check.out R/pkg/vignettes/sparkr-vignettes.html +R/pkg/tests/fulltests/Rplots.pdf build/*.jar build/apache-maven* build/scala* From de14086e1f6a2474bb9ba1452ada94e0ce58cf9c Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 5 Jul 2017 10:40:02 +0800 Subject: [PATCH 087/779] [SPARK-21295][SQL] Use qualified names in error message for missing references ### What changes were proposed in this pull request? It is strange to see the following error message. Actually, the column is from another table. ``` cannot resolve '`right.a`' given input columns: [a, c, d]; ``` After the PR, the error message looks like ``` cannot resolve '`right.a`' given input columns: [left.a, right.c, right.d]; ``` ### How was this patch tested? Added a test case Author: gatorsmile Closes #18520 from gatorsmile/removeSQLConf. 
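For reference, a minimal spark-shell sketch of the new message shape, adapted from the `SubquerySuite` case updated in the diff below (the view name `t` and columns `i`, `j` come from that test; the exact message depends on the query):

```scala
// Adapted from the updated SubquerySuite test: the unresolved column `a`
// is now reported against qualified candidates [t.i, t.j] instead of [i, j].
import spark.implicits._

Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("t")
spark.sql("SELECT (SELECT count(*) FROM t WHERE a = 1)")
// org.apache.spark.sql.AnalysisException:
// cannot resolve '`a`' given input columns: [t.i, t.j]
```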
--- .../sql/catalyst/analysis/CheckAnalysis.scala | 2 +- .../results/columnresolution-negative.sql.out | 10 +++++----- .../results/columnresolution-views.sql.out | 2 +- .../sql-tests/results/columnresolution.sql.out | 18 +++++++++--------- .../sql-tests/results/group-by.sql.out | 2 +- .../sql-tests/results/table-aliases.sql.out | 2 +- .../org/apache/spark/sql/SubquerySuite.scala | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index fb81a7006bc5e..85c52792ef659 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -86,7 +86,7 @@ trait CheckAnalysis extends PredicateHelper { case operator: LogicalPlan => operator transformExpressionsUp { case a: Attribute if !a.resolved => - val from = operator.inputSet.map(_.name).mkString(", ") + val from = operator.inputSet.map(_.qualifiedName).mkString(", ") a.failAnalysis(s"cannot resolve '${a.sql}' given input columns: [$from]") case e: Expression if e.checkInputDataTypes().isFailure => diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out index 60bd8e9cc99db..9e60e592c2bd1 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out @@ -90,7 +90,7 @@ SELECT mydb1.t1.i1 FROM t1, mydb1.t1 struct<> -- !query 10 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1, i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1, t1.i1]; line 1 pos 7 -- !query 11 @@ -161,7 +161,7 @@ SELECT db1.t1.i1 FROM t1, mydb2.t1 struct<> -- !query 18 output org.apache.spark.sql.AnalysisException -cannot resolve '`db1.t1.i1`' given input columns: [i1, i1]; line 1 pos 7 +cannot resolve '`db1.t1.i1`' given input columns: [t1.i1, t1.i1]; line 1 pos 7 -- !query 19 @@ -186,7 +186,7 @@ SELECT mydb1.t1 FROM t1 struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`mydb1.t1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 22 @@ -204,7 +204,7 @@ SELECT t1 FROM mydb1.t1 struct<> -- !query 23 output org.apache.spark.sql.AnalysisException -cannot resolve '`t1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`t1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 24 @@ -221,7 +221,7 @@ SELECT mydb1.t1.i1 FROM t1 struct<> -- !query 25 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 26 diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out index 616421d6f2b28..7c451c2aa5b5c 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out @@ -105,7 +105,7 @@ SELECT global_temp.view1.i1 FROM global_temp.view1 struct<> -- !query 12 output 
org.apache.spark.sql.AnalysisException -cannot resolve '`global_temp.view1.i1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`global_temp.view1.i1`' given input columns: [view1.i1]; line 1 pos 7 -- !query 13 diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out index 764cad0e3943c..d3ca4443cce55 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out @@ -96,7 +96,7 @@ SELECT mydb1.t1.i1 FROM t1 struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 12 @@ -105,7 +105,7 @@ SELECT mydb1.t1.i1 FROM mydb1.t1 struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 13 @@ -154,7 +154,7 @@ SELECT mydb1.t1.i1 FROM mydb1.t1 struct<> -- !query 18 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1]; line 1 pos 7 -- !query 19 @@ -270,7 +270,7 @@ SELECT * FROM mydb1.t3 WHERE c1 IN struct<> -- !query 32 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t4.c3`' given input columns: [c2, c3]; line 2 pos 42 +cannot resolve '`mydb1.t4.c3`' given input columns: [t4.c2, t4.c3]; line 2 pos 42 -- !query 33 @@ -287,7 +287,7 @@ SELECT mydb1.t1.i1 FROM t1, mydb2.t1 struct<> -- !query 34 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1, i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1, t1.i1]; line 1 pos 7 -- !query 35 @@ -296,7 +296,7 @@ SELECT mydb1.t1.i1 FROM mydb1.t1, mydb2.t1 struct<> -- !query 35 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1, i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1, t1.i1]; line 1 pos 7 -- !query 36 @@ -313,7 +313,7 @@ SELECT mydb1.t1.i1 FROM t1, mydb1.t1 struct<> -- !query 37 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [i1, i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [t1.i1, t1.i1]; line 1 pos 7 -- !query 38 @@ -402,7 +402,7 @@ SELECT mydb1.t5.t5.i1 FROM mydb1.t5 struct<> -- !query 48 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t5.t5.i1`' given input columns: [i1, t5]; line 1 pos 7 +cannot resolve '`mydb1.t5.t5.i1`' given input columns: [t5.i1, t5.t5]; line 1 pos 7 -- !query 49 @@ -411,7 +411,7 @@ SELECT mydb1.t5.t5.i2 FROM mydb1.t5 struct<> -- !query 49 output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t5.t5.i2`' given input columns: [i1, t5]; line 1 pos 7 +cannot resolve '`mydb1.t5.t5.i2`' given input columns: [t5.i1, t5.t5]; line 1 pos 7 -- !query 50 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 14679850c692e..e23ebd4e822fa 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -202,7 +202,7 @@ SELECT a AS 
k, COUNT(b) FROM testData GROUP BY k struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve '`k`' given input columns: [a, b]; line 1 pos 47 +cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 47 -- !query 22 diff --git a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out index c318018dced29..7abbcd834a523 100644 --- a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out @@ -60,4 +60,4 @@ SELECT a AS col1, b AS col2 FROM testData AS t(c, d) struct<> -- !query 6 output org.apache.spark.sql.AnalysisException -cannot resolve '`a`' given input columns: [c, d]; line 1 pos 7 +cannot resolve '`a`' given input columns: [t.c, t.d]; line 1 pos 7 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 820cff655c4ff..c0a3b5add313a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -870,9 +870,9 @@ class SubquerySuite extends QueryTest with SharedSQLContext { test("SPARK-20688: correctly check analysis for scalar sub-queries") { withTempView("t") { - Seq(1 -> "a").toDF("i", "j").createTempView("t") + Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("t") val e = intercept[AnalysisException](sql("SELECT (SELECT count(*) FROM t WHERE a = 1)")) - assert(e.message.contains("cannot resolve '`a`' given input columns: [i, j]")) + assert(e.message.contains("cannot resolve '`a`' given input columns: [t.i, t.j]")) } } } From ce10545d3401c555e56a214b7c2f334274803660 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 5 Jul 2017 11:24:38 +0800 Subject: [PATCH 088/779] [SPARK-21300][SQL] ExternalMapToCatalyst should null-check map key prior to converting to internal value. ## What changes were proposed in this pull request? `ExternalMapToCatalyst` should null-check map key prior to converting to internal value to throw an appropriate Exception instead of something like NPE. ## How was this patch tested? Added a test and existing tests. Author: Takuya UESHIN Closes #18524 from ueshin/issues/SPARK-21300. 
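A small sketch of the user-visible change, based on the `ExpressionEncoderSuite` case added in the diff below (it calls the internal `ExpressionEncoder` API directly; the map values are arbitrary):

```scala
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

// Boxed Integer keys can be null at runtime; encoding such a map now fails
// with a descriptive error instead of a NullPointerException.
val encoder = ExpressionEncoder[Map[Integer, String]]()
encoder.toRow(Map((1, "a"), (null, "b")))
// java.lang.RuntimeException: Cannot use null as map key
```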
--- .../spark/sql/catalyst/JavaTypeInference.scala | 1 + .../spark/sql/catalyst/ScalaReflection.scala | 1 + .../catalyst/expressions/objects/objects.scala | 16 +++++++++++++++- .../encoders/ExpressionEncoderSuite.scala | 8 +++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 7683ee7074e7d..90ec699877dec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -418,6 +418,7 @@ object JavaTypeInference { inputObject, ObjectType(keyType.getRawType), serializerFor(_, keyType), + keyNullable = true, ObjectType(valueType.getRawType), serializerFor(_, valueType), valueNullable = true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index d580cf4d3391c..f3c1e4150017d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -494,6 +494,7 @@ object ScalaReflection extends ScalaReflection { inputObject, dataTypeFor(keyType), serializerFor(_, keyType, keyPath, seenTypeSet), + keyNullable = !keyType.typeSymbol.asClass.isPrimitive, dataTypeFor(valueType), serializerFor(_, valueType, valuePath, seenTypeSet), valueNullable = !valueType.typeSymbol.asClass.isPrimitive) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 4b651836ff4d2..d6d06aecc077b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -841,18 +841,21 @@ object ExternalMapToCatalyst { inputMap: Expression, keyType: DataType, keyConverter: Expression => Expression, + keyNullable: Boolean, valueType: DataType, valueConverter: Expression => Expression, valueNullable: Boolean): ExternalMapToCatalyst = { val id = curId.getAndIncrement() val keyName = "ExternalMapToCatalyst_key" + id + val keyIsNull = "ExternalMapToCatalyst_key_isNull" + id val valueName = "ExternalMapToCatalyst_value" + id val valueIsNull = "ExternalMapToCatalyst_value_isNull" + id ExternalMapToCatalyst( keyName, + keyIsNull, keyType, - keyConverter(LambdaVariable(keyName, "false", keyType, false)), + keyConverter(LambdaVariable(keyName, keyIsNull, keyType, keyNullable)), valueName, valueIsNull, valueType, @@ -868,6 +871,8 @@ object ExternalMapToCatalyst { * * @param key the name of the map key variable that used when iterate the map, and used as input for * the `keyConverter` + * @param keyIsNull the nullability of the map key variable that used when iterate the map, and + * used as input for the `keyConverter` * @param keyType the data type of the map key variable that used when iterate the map, and used as * input for the `keyConverter` * @param keyConverter A function that take the `key` as input, and converts it to catalyst format. 
@@ -883,6 +888,7 @@ object ExternalMapToCatalyst { */ case class ExternalMapToCatalyst private( key: String, + keyIsNull: String, keyType: DataType, keyConverter: Expression, value: String, @@ -913,6 +919,7 @@ case class ExternalMapToCatalyst private( val keyElementJavaType = ctx.javaType(keyType) val valueElementJavaType = ctx.javaType(valueType) + ctx.addMutableState("boolean", keyIsNull, "") ctx.addMutableState(keyElementJavaType, key, "") ctx.addMutableState("boolean", valueIsNull, "") ctx.addMutableState(valueElementJavaType, value, "") @@ -950,6 +957,12 @@ case class ExternalMapToCatalyst private( defineEntries -> defineKeyValue } + val keyNullCheck = if (ctx.isPrimitiveType(keyType)) { + s"$keyIsNull = false;" + } else { + s"$keyIsNull = $key == null;" + } + val valueNullCheck = if (ctx.isPrimitiveType(valueType)) { s"$valueIsNull = false;" } else { @@ -972,6 +985,7 @@ case class ExternalMapToCatalyst private( $defineEntries while($entries.hasNext()) { $defineKeyValue + $keyNullCheck $valueNullCheck ${genKeyConverter.code} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala index 080f11b769388..bb1955a1ae242 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala @@ -355,12 +355,18 @@ class ExpressionEncoderSuite extends PlanTest with AnalysisTest { checkNullable[String](true) } - test("null check for map key") { + test("null check for map key: String") { val encoder = ExpressionEncoder[Map[String, Int]]() val e = intercept[RuntimeException](encoder.toRow(Map(("a", 1), (null, 2)))) assert(e.getMessage.contains("Cannot use null as map key")) } + test("null check for map key: Integer") { + val encoder = ExpressionEncoder[Map[Integer, String]]() + val e = intercept[RuntimeException](encoder.toRow(Map((1, "a"), (null, "b")))) + assert(e.getMessage.contains("Cannot use null as map key")) + } + private def encodeDecodeTest[T : ExpressionEncoder]( input: T, testName: String): Unit = { From e9a93f8140c913b91781b35e0e1b051c30244882 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 4 Jul 2017 21:05:05 -0700 Subject: [PATCH 089/779] [SPARK-20889][SPARKR][FOLLOWUP] Clean up grouped doc for column methods ## What changes were proposed in this pull request? Add doc for methods that were left out, and fix various style and consistency issues. Author: actuaryzhang Closes #18493 from actuaryzhang/sparkRDocCleanup. --- R/pkg/R/functions.R | 100 ++++++++++++++++++++------------------------ R/pkg/R/generics.R | 7 ++-- 2 files changed, 49 insertions(+), 58 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 8c12308c1d7c1..c529d83060f50 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -38,10 +38,10 @@ NULL #' #' Date time functions defined for \code{Column}. #' -#' @param x Column to compute on. +#' @param x Column to compute on. In \code{window}, it must be a time Column of \code{TimestampType}. #' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse -#' x Column to DateType or TimestampType. For \code{trunc}, it is the string used -#' for specifying the truncation method. For example, "year", "yyyy", "yy" for +#' Column \code{x} to DateType or TimestampType. 
For \code{trunc}, it is the string +#' to use to specify the truncation method. For example, "year", "yyyy", "yy" for #' truncate by year, or "month", "mon", "mm" for truncate by month. #' @param ... additional argument(s). #' @name column_datetime_functions @@ -122,7 +122,7 @@ NULL #' format to. See 'Details'. #' } #' @param y Column to compute on. -#' @param ... additional columns. +#' @param ... additional Columns. #' @name column_string_functions #' @rdname column_string_functions #' @family string functions @@ -167,8 +167,7 @@ NULL #' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model), #' v3 = hash(df$model, df$mpg), v4 = md5(df$model), #' v5 = sha1(df$model), v6 = sha2(df$model, 256)) -#' head(tmp) -#' } +#' head(tmp)} NULL #' Collection functions for Column operations @@ -190,7 +189,6 @@ NULL #' \dontrun{ #' # Dataframe used throughout this doc #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) -#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1))) #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) @@ -394,7 +392,7 @@ setMethod("base64", }) #' @details -#' \code{bin}: An expression that returns the string representation of the binary value +#' \code{bin}: Returns the string representation of the binary value #' of the given long column. For example, bin("12") returns "1100". #' #' @rdname column_math_functions @@ -722,7 +720,7 @@ setMethod("dayofyear", #' \code{decode}: Computes the first argument into a string from a binary using the provided #' character set. #' -#' @param charset Character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", +#' @param charset character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", #' "UTF-16LE", "UTF-16"). #' #' @rdname column_string_functions @@ -855,7 +853,7 @@ setMethod("hex", }) #' @details -#' \code{hour}: Extracts the hours as an integer from a given date/timestamp/string. +#' \code{hour}: Extracts the hour as an integer from a given date/timestamp/string. #' #' @rdname column_datetime_functions #' @aliases hour hour,Column-method @@ -1177,7 +1175,7 @@ setMethod("min", }) #' @details -#' \code{minute}: Extracts the minutes as an integer from a given date/timestamp/string. +#' \code{minute}: Extracts the minute as an integer from a given date/timestamp/string. #' #' @rdname column_datetime_functions #' @aliases minute minute,Column-method @@ -1354,7 +1352,7 @@ setMethod("sd", }) #' @details -#' \code{second}: Extracts the seconds as an integer from a given date/timestamp/string. +#' \code{second}: Extracts the second as an integer from a given date/timestamp/string. #' #' @rdname column_datetime_functions #' @aliases second second,Column-method @@ -1464,20 +1462,18 @@ setMethod("soundex", column(jc) }) -#' Return the partition ID as a column -#' -#' Return the partition ID as a SparkDataFrame column. +#' @details +#' \code{spark_partition_id}: Returns the partition ID as a SparkDataFrame column. #' Note that this is nondeterministic because it depends on data partitioning and #' task scheduling. +#' This is equivalent to the \code{SPARK_PARTITION_ID} function in SQL. #' -#' This is equivalent to the SPARK_PARTITION_ID function in SQL. 
-#' -#' @rdname spark_partition_id -#' @name spark_partition_id -#' @aliases spark_partition_id,missing-method +#' @rdname column_nonaggregate_functions +#' @aliases spark_partition_id spark_partition_id,missing-method #' @export #' @examples -#' \dontrun{select(df, spark_partition_id())} +#' +#' \dontrun{head(select(df, spark_partition_id()))} #' @note spark_partition_id since 2.0.0 setMethod("spark_partition_id", signature("missing"), @@ -2028,7 +2024,7 @@ setMethod("pmod", signature(y = "Column"), column(jc) }) -#' @param rsd maximum estimation error allowed (default = 0.05) +#' @param rsd maximum estimation error allowed (default = 0.05). #' #' @rdname column_aggregate_functions #' @aliases approxCountDistinct,Column-method @@ -2220,8 +2216,8 @@ setMethod("from_json", signature(x = "Column", schema = "structType"), #' @examples #' #' \dontrun{ -#' tmp <- mutate(df, from_utc = from_utc_timestamp(df$time, 'PST'), -#' to_utc = to_utc_timestamp(df$time, 'PST')) +#' tmp <- mutate(df, from_utc = from_utc_timestamp(df$time, "PST"), +#' to_utc = to_utc_timestamp(df$time, "PST")) #' head(tmp)} #' @note from_utc_timestamp since 1.5.0 setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), @@ -2255,7 +2251,7 @@ setMethod("instr", signature(y = "Column", x = "character"), #' @details #' \code{next_day}: Given a date column, returns the first date which is later than the value of #' the date column that is on the specified day of the week. For example, -#' \code{next_day('2015-07-27', "Sunday")} returns 2015-08-02 because that is the first Sunday +#' \code{next_day("2015-07-27", "Sunday")} returns 2015-08-02 because that is the first Sunday #' after 2015-07-27. Day of the week parameter is case insensitive, and accepts first three or #' two characters: "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". #' @@ -2295,7 +2291,7 @@ setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), #' tmp <- mutate(df, t1 = add_months(df$time, 1), #' t2 = date_add(df$time, 2), #' t3 = date_sub(df$time, 3), -#' t4 = next_day(df$time, 'Sun')) +#' t4 = next_day(df$time, "Sun")) #' head(tmp)} #' @note add_months since 1.5.0 setMethod("add_months", signature(y = "Column", x = "numeric"), @@ -2404,8 +2400,8 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRight}: (Unigned) shifts the given value numBits right. If the given value is a long value, -#' it will return a long value else it will return an integer value. +#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is +#' a long value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions #' @aliases shiftRightUnsigned shiftRightUnsigned,Column,numeric-method @@ -2513,14 +2509,13 @@ setMethod("from_unixtime", signature(x = "Column"), column(jc) }) -#' window -#' -#' Bucketize rows into one or more time windows given a timestamp specifying column. Window -#' starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window +#' @details +#' \code{window}: Bucketizes rows into one or more time windows given a timestamp specifying column. +#' Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window #' [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in -#' the order of months are not supported. +#' the order of months are not supported. 
It returns an output column of struct called 'window' +#' by default with the nested columns 'start' and 'end' #' -#' @param x a time Column. Must be of TimestampType. #' @param windowDuration a string specifying the width of the window, e.g. '1 second', #' '1 day 12 hours', '2 minutes'. Valid interval strings are 'week', #' 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. Note that @@ -2536,27 +2531,22 @@ setMethod("from_unixtime", signature(x = "Column"), #' window intervals. For example, in order to have hourly tumbling windows #' that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide #' \code{startTime} as \code{"15 minutes"}. -#' @param ... further arguments to be passed to or from other methods. -#' @return An output column of struct called 'window' by default with the nested columns 'start' -#' and 'end'. -#' @family date time functions -#' @rdname window -#' @name window -#' @aliases window,Column-method +#' @rdname column_datetime_functions +#' @aliases window window,Column-method #' @export #' @examples -#'\dontrun{ -#' # One minute windows every 15 seconds 10 seconds after the minute, e.g. 09:00:10-09:01:10, -#' # 09:00:25-09:01:25, 09:00:40-09:01:40, ... -#' window(df$time, "1 minute", "15 seconds", "10 seconds") #' -#' # One minute tumbling windows 15 seconds after the minute, e.g. 09:00:15-09:01:15, -#' # 09:01:15-09:02:15... -#' window(df$time, "1 minute", startTime = "15 seconds") +#' \dontrun{ +#' # One minute windows every 15 seconds 10 seconds after the minute, e.g. 09:00:10-09:01:10, +#' # 09:00:25-09:01:25, 09:00:40-09:01:40, ... +#' window(df$time, "1 minute", "15 seconds", "10 seconds") #' -#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... -#' window(df$time, "30 seconds", "10 seconds") -#'} +#' # One minute tumbling windows 15 seconds after the minute, e.g. 09:00:15-09:01:15, +#' # 09:01:15-09:02:15... +#' window(df$time, "1 minute", startTime = "15 seconds") +#' +#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... +#' window(df$time, "30 seconds", "10 seconds")} #' @note window since 2.0.0 setMethod("window", signature(x = "Column"), function(x, windowDuration, slideDuration = NULL, startTime = NULL) { @@ -3046,7 +3036,7 @@ setMethod("row_number", #' \code{array_contains}: Returns null if the array is null, true if the array contains #' the value, and false otherwise. #' -#' @param value A value to be checked if contained in the column +#' @param value a value to be checked if contained in the column #' @rdname column_collection_functions #' @aliases array_contains array_contains,Column-method #' @export @@ -3091,7 +3081,7 @@ setMethod("size", #' to the natural ordering of the array elements. #' #' @rdname column_collection_functions -#' @param asc A logical flag indicating the sorting order. +#' @param asc a logical flag indicating the sorting order. #' TRUE, sorting is in ascending order. #' FALSE, sorting is in descending order. #' @aliases sort_array sort_array,Column-method @@ -3218,7 +3208,7 @@ setMethod("split_string", #' \code{repeat_string}: Repeats string n times. #' Equivalent to \code{repeat} SQL function. #' -#' @param n Number of repetitions +#' @param n number of repetitions. #' @rdname column_string_functions #' @aliases repeat_string repeat_string,Column-method #' @export @@ -3347,7 +3337,7 @@ setMethod("grouping_bit", #' \code{grouping_id}: Returns the level of grouping. 
#' Equals to \code{ #' grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn) -#' } +#' }. #' #' @rdname column_aggregate_functions #' @aliases grouping_id grouping_id,Column-method diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index beac18e412736..92098741f72f9 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1418,9 +1418,9 @@ setGeneric("split_string", function(x, pattern) { standardGeneric("split_string" #' @name NULL setGeneric("soundex", function(x) { standardGeneric("soundex") }) -#' @param x empty. Should be used with no argument. -#' @rdname spark_partition_id +#' @rdname column_nonaggregate_functions #' @export +#' @name NULL setGeneric("spark_partition_id", function(x = "missing") { standardGeneric("spark_partition_id") }) #' @rdname column_aggregate_functions @@ -1538,8 +1538,9 @@ setGeneric("var_samp", function(x) { standardGeneric("var_samp") }) #' @name NULL setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) -#' @rdname window +#' @rdname column_datetime_functions #' @export +#' @name NULL setGeneric("window", function(x, ...) { standardGeneric("window") }) #' @rdname column_datetime_functions From f2c3b1dd69423cf52880e0ffa5f673ad6041b40e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 5 Jul 2017 14:17:26 +0800 Subject: [PATCH 090/779] [SPARK-21304][SQL] remove unnecessary isNull variable for collection related encoder expressions ## What changes were proposed in this pull request? For these collection-related encoder expressions, we don't need to create `isNull` variable if the loop element is not nullable. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #18529 from cloud-fan/minor. --- .../spark/sql/catalyst/ScalaReflection.scala | 2 +- .../expressions/objects/objects.scala | 77 ++++++++++++------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index f3c1e4150017d..bea0de4d90c2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -335,7 +335,7 @@ object ScalaReflection extends ScalaReflection { // TODO: add walked type path for map val TypeRef(_, _, Seq(keyType, valueType)) = t - CollectObjectsToMap( + CatalystToExternalMap( p => deserializerFor(keyType, Some(p), walkedTypePath), p => deserializerFor(valueType, Some(p), walkedTypePath), getPath, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index d6d06aecc077b..ce07f4a25c189 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -465,7 +465,11 @@ object MapObjects { customCollectionCls: Option[Class[_]] = None): MapObjects = { val id = curId.getAndIncrement() val loopValue = s"MapObjects_loopValue$id" - val loopIsNull = s"MapObjects_loopIsNull$id" + val loopIsNull = if (elementNullable) { + s"MapObjects_loopIsNull$id" + } else { + "false" + } val loopVar = LambdaVariable(loopValue, loopIsNull, elementType, elementNullable) MapObjects( loopValue, loopIsNull, elementType, function(loopVar), inputData, customCollectionCls) @@ 
-517,7 +521,6 @@ case class MapObjects private( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val elementJavaType = ctx.javaType(loopVarDataType) - ctx.addMutableState("boolean", loopIsNull, "") ctx.addMutableState(elementJavaType, loopValue, "") val genInputData = inputData.genCode(ctx) val genFunction = lambdaFunction.genCode(ctx) @@ -588,12 +591,14 @@ case class MapObjects private( case _ => genFunction.value } - val loopNullCheck = inputDataType match { - case _: ArrayType => s"$loopIsNull = ${genInputData.value}.isNullAt($loopIndex);" - // The element of primitive array will never be null. - case ObjectType(cls) if cls.isArray && cls.getComponentType.isPrimitive => - s"$loopIsNull = false" - case _ => s"$loopIsNull = $loopValue == null;" + val loopNullCheck = if (loopIsNull != "false") { + ctx.addMutableState("boolean", loopIsNull, "") + inputDataType match { + case _: ArrayType => s"$loopIsNull = ${genInputData.value}.isNullAt($loopIndex);" + case _ => s"$loopIsNull = $loopValue == null;" + } + } else { + "" } val (initCollection, addElement, getResult): (String, String => String, String) = @@ -667,11 +672,11 @@ case class MapObjects private( } } -object CollectObjectsToMap { +object CatalystToExternalMap { private val curId = new java.util.concurrent.atomic.AtomicInteger() /** - * Construct an instance of CollectObjectsToMap case class. + * Construct an instance of CatalystToExternalMap case class. * * @param keyFunction The function applied on the key collection elements. * @param valueFunction The function applied on the value collection elements. @@ -682,15 +687,19 @@ object CollectObjectsToMap { keyFunction: Expression => Expression, valueFunction: Expression => Expression, inputData: Expression, - collClass: Class[_]): CollectObjectsToMap = { + collClass: Class[_]): CatalystToExternalMap = { val id = curId.getAndIncrement() - val keyLoopValue = s"CollectObjectsToMap_keyLoopValue$id" + val keyLoopValue = s"CatalystToExternalMap_keyLoopValue$id" val mapType = inputData.dataType.asInstanceOf[MapType] val keyLoopVar = LambdaVariable(keyLoopValue, "", mapType.keyType, nullable = false) - val valueLoopValue = s"CollectObjectsToMap_valueLoopValue$id" - val valueLoopIsNull = s"CollectObjectsToMap_valueLoopIsNull$id" + val valueLoopValue = s"CatalystToExternalMap_valueLoopValue$id" + val valueLoopIsNull = if (mapType.valueContainsNull) { + s"CatalystToExternalMap_valueLoopIsNull$id" + } else { + "false" + } val valueLoopVar = LambdaVariable(valueLoopValue, valueLoopIsNull, mapType.valueType) - CollectObjectsToMap( + CatalystToExternalMap( keyLoopValue, keyFunction(keyLoopVar), valueLoopValue, valueLoopIsNull, valueFunction(valueLoopVar), inputData, collClass) @@ -716,7 +725,7 @@ object CollectObjectsToMap { * @param inputData An expression that when evaluated returns a map object. * @param collClass The type of the resulting collection. 
*/ -case class CollectObjectsToMap private( +case class CatalystToExternalMap private( keyLoopValue: String, keyLambdaFunction: Expression, valueLoopValue: String, @@ -748,7 +757,6 @@ case class CollectObjectsToMap private( ctx.addMutableState(keyElementJavaType, keyLoopValue, "") val genKeyFunction = keyLambdaFunction.genCode(ctx) val valueElementJavaType = ctx.javaType(mapType.valueType) - ctx.addMutableState("boolean", valueLoopIsNull, "") ctx.addMutableState(valueElementJavaType, valueLoopValue, "") val genValueFunction = valueLambdaFunction.genCode(ctx) val genInputData = inputData.genCode(ctx) @@ -781,7 +789,12 @@ case class CollectObjectsToMap private( val genKeyFunctionValue = genFunctionValue(keyLambdaFunction, genKeyFunction) val genValueFunctionValue = genFunctionValue(valueLambdaFunction, genValueFunction) - val valueLoopNullCheck = s"$valueLoopIsNull = $valueArray.isNullAt($loopIndex);" + val valueLoopNullCheck = if (valueLoopIsNull != "false") { + ctx.addMutableState("boolean", valueLoopIsNull, "") + s"$valueLoopIsNull = $valueArray.isNullAt($loopIndex);" + } else { + "" + } val builderClass = classOf[Builder[_, _]].getName val constructBuilder = s""" @@ -847,9 +860,17 @@ object ExternalMapToCatalyst { valueNullable: Boolean): ExternalMapToCatalyst = { val id = curId.getAndIncrement() val keyName = "ExternalMapToCatalyst_key" + id - val keyIsNull = "ExternalMapToCatalyst_key_isNull" + id + val keyIsNull = if (keyNullable) { + "ExternalMapToCatalyst_key_isNull" + id + } else { + "false" + } val valueName = "ExternalMapToCatalyst_value" + id - val valueIsNull = "ExternalMapToCatalyst_value_isNull" + id + val valueIsNull = if (valueNullable) { + "ExternalMapToCatalyst_value_isNull" + id + } else { + "false" + } ExternalMapToCatalyst( keyName, @@ -919,9 +940,7 @@ case class ExternalMapToCatalyst private( val keyElementJavaType = ctx.javaType(keyType) val valueElementJavaType = ctx.javaType(valueType) - ctx.addMutableState("boolean", keyIsNull, "") ctx.addMutableState(keyElementJavaType, key, "") - ctx.addMutableState("boolean", valueIsNull, "") ctx.addMutableState(valueElementJavaType, value, "") val (defineEntries, defineKeyValue) = child.dataType match { @@ -957,16 +976,18 @@ case class ExternalMapToCatalyst private( defineEntries -> defineKeyValue } - val keyNullCheck = if (ctx.isPrimitiveType(keyType)) { - s"$keyIsNull = false;" - } else { + val keyNullCheck = if (keyIsNull != "false") { + ctx.addMutableState("boolean", keyIsNull, "") s"$keyIsNull = $key == null;" + } else { + "" } - val valueNullCheck = if (ctx.isPrimitiveType(valueType)) { - s"$valueIsNull = false;" - } else { + val valueNullCheck = if (valueIsNull != "false") { + ctx.addMutableState("boolean", valueIsNull, "") s"$valueIsNull = $value == null;" + } else { + "" } val arrayCls = classOf[GenericArrayData].getName From a38643256691947ff7f7c474b85c052a7d5d8553 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 5 Jul 2017 14:25:26 +0800 Subject: [PATCH 091/779] [SPARK-18623][SQL] Add `returnNullable` to `StaticInvoke` and modify it to handle properly. ## What changes were proposed in this pull request? Add `returnNullable` to `StaticInvoke` the same as #15780 is trying to add to `Invoke` and modify to handle properly. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Author: Takuya UESHIN Closes #16056 from ueshin/issues/SPARK-18623. 
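A hedged sketch of the new flag on this internal Catalyst API: the `fromJavaDate` call mirrors the serializer call sites updated in the diff below, while `Literal.fromObject` is only a convenient way to build a non-null input for illustration.

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.DateType

// DateTimeUtils.fromJavaDate never returns null for a non-null input, so the
// serializer can declare the invocation non-nullable and codegen can skip
// the null check on the returned value.
val expr = StaticInvoke(
  DateTimeUtils.getClass,
  DateType,
  "fromJavaDate",
  Literal.fromObject(java.sql.Date.valueOf("2017-07-05")) :: Nil,
  returnNullable = false)

assert(!expr.nullable)
```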
--- .../sql/catalyst/JavaTypeInference.scala | 21 +++++---- .../spark/sql/catalyst/ScalaReflection.scala | 44 ++++++++++++------- .../sql/catalyst/encoders/RowEncoder.scala | 27 ++++++++---- .../expressions/objects/objects.scala | 42 ++++++++++++++---- 4 files changed, 91 insertions(+), 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala index 90ec699877dec..21363d3ba82c1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala @@ -216,7 +216,7 @@ object JavaTypeInference { ObjectType(c), "valueOf", getPath :: Nil, - propagateNull = true) + returnNullable = false) case c if c == classOf[java.sql.Date] => StaticInvoke( @@ -224,7 +224,7 @@ object JavaTypeInference { ObjectType(c), "toJavaDate", getPath :: Nil, - propagateNull = true) + returnNullable = false) case c if c == classOf[java.sql.Timestamp] => StaticInvoke( @@ -232,7 +232,7 @@ object JavaTypeInference { ObjectType(c), "toJavaTimestamp", getPath :: Nil, - propagateNull = true) + returnNullable = false) case c if c == classOf[java.lang.String] => Invoke(getPath, "toString", ObjectType(classOf[String])) @@ -300,7 +300,8 @@ object JavaTypeInference { ArrayBasedMapData.getClass, ObjectType(classOf[JMap[_, _]]), "toJavaMap", - keyData :: valueData :: Nil) + keyData :: valueData :: Nil, + returnNullable = false) case other => val properties = getJavaBeanReadableAndWritableProperties(other) @@ -367,28 +368,32 @@ object JavaTypeInference { classOf[UTF8String], StringType, "fromString", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case c if c == classOf[java.sql.Timestamp] => StaticInvoke( DateTimeUtils.getClass, TimestampType, "fromJavaTimestamp", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case c if c == classOf[java.sql.Date] => StaticInvoke( DateTimeUtils.getClass, DateType, "fromJavaDate", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case c if c == classOf[java.math.BigDecimal] => StaticInvoke( Decimal.getClass, DecimalType.SYSTEM_DEFAULT, "apply", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case c if c == classOf[java.lang.Boolean] => Invoke(inputObject, "booleanValue", BooleanType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index bea0de4d90c2f..814f2c10b9097 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -206,51 +206,53 @@ object ScalaReflection extends ScalaReflection { case t if t <:< localTypeOf[java.lang.Integer] => val boxedType = classOf[java.lang.Integer] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.lang.Long] => val boxedType = classOf[java.lang.Long] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< 
localTypeOf[java.lang.Double] => val boxedType = classOf[java.lang.Double] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.lang.Float] => val boxedType = classOf[java.lang.Float] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.lang.Short] => val boxedType = classOf[java.lang.Short] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.lang.Byte] => val boxedType = classOf[java.lang.Byte] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.lang.Boolean] => val boxedType = classOf[java.lang.Boolean] val objectType = ObjectType(boxedType) - StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, propagateNull = true) + StaticInvoke(boxedType, objectType, "valueOf", getPath :: Nil, returnNullable = false) case t if t <:< localTypeOf[java.sql.Date] => StaticInvoke( DateTimeUtils.getClass, ObjectType(classOf[java.sql.Date]), "toJavaDate", - getPath :: Nil) + getPath :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.sql.Timestamp] => StaticInvoke( DateTimeUtils.getClass, ObjectType(classOf[java.sql.Timestamp]), "toJavaTimestamp", - getPath :: Nil) + getPath :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.lang.String] => Invoke(getPath, "toString", ObjectType(classOf[String]), returnNullable = false) @@ -446,7 +448,8 @@ object ScalaReflection extends ScalaReflection { classOf[UnsafeArrayData], ArrayType(dt, false), "fromPrimitiveArray", - input :: Nil) + input :: Nil, + returnNullable = false) } else { NewInstance( classOf[GenericArrayData], @@ -504,49 +507,56 @@ object ScalaReflection extends ScalaReflection { classOf[UTF8String], StringType, "fromString", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.sql.Timestamp] => StaticInvoke( DateTimeUtils.getClass, TimestampType, "fromJavaTimestamp", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.sql.Date] => StaticInvoke( DateTimeUtils.getClass, DateType, "fromJavaDate", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[BigDecimal] => StaticInvoke( Decimal.getClass, DecimalType.SYSTEM_DEFAULT, "apply", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.math.BigDecimal] => StaticInvoke( Decimal.getClass, DecimalType.SYSTEM_DEFAULT, "apply", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.math.BigInteger] => StaticInvoke( Decimal.getClass, DecimalType.BigIntDecimal, "apply", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[scala.math.BigInt] => StaticInvoke( Decimal.getClass, DecimalType.BigIntDecimal, "apply", - inputObject :: 
Nil) + inputObject :: Nil, + returnNullable = false) case t if t <:< localTypeOf[java.lang.Integer] => Invoke(inputObject, "intValue", IntegerType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 0f8282d3b2f1f..cc32fac67e924 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -96,28 +96,32 @@ object RowEncoder { DateTimeUtils.getClass, TimestampType, "fromJavaTimestamp", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case DateType => StaticInvoke( DateTimeUtils.getClass, DateType, "fromJavaDate", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case d: DecimalType => StaticInvoke( Decimal.getClass, d, "fromDecimal", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case StringType => StaticInvoke( classOf[UTF8String], StringType, "fromString", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case t @ ArrayType(et, cn) => et match { @@ -126,7 +130,8 @@ object RowEncoder { classOf[ArrayData], t, "toArrayData", - inputObject :: Nil) + inputObject :: Nil, + returnNullable = false) case _ => MapObjects( element => serializerFor(ValidateExternalType(element, et), et), inputObject, @@ -254,14 +259,16 @@ object RowEncoder { DateTimeUtils.getClass, ObjectType(classOf[java.sql.Timestamp]), "toJavaTimestamp", - input :: Nil) + input :: Nil, + returnNullable = false) case DateType => StaticInvoke( DateTimeUtils.getClass, ObjectType(classOf[java.sql.Date]), "toJavaDate", - input :: Nil) + input :: Nil, + returnNullable = false) case _: DecimalType => Invoke(input, "toJavaBigDecimal", ObjectType(classOf[java.math.BigDecimal]), @@ -280,7 +287,8 @@ object RowEncoder { scala.collection.mutable.WrappedArray.getClass, ObjectType(classOf[Seq[_]]), "make", - arrayData :: Nil) + arrayData :: Nil, + returnNullable = false) case MapType(kt, vt, valueNullable) => val keyArrayType = ArrayType(kt, false) @@ -293,7 +301,8 @@ object RowEncoder { ArrayBasedMapData.getClass, ObjectType(classOf[Map[_, _]]), "toScalaMap", - keyData :: valueData :: Nil) + keyData :: valueData :: Nil, + returnNullable = false) case schema @ StructType(fields) => val convertedFields = fields.zipWithIndex.map { case (f, i) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index ce07f4a25c189..24c06d8b14b54 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -118,17 +118,20 @@ trait InvokeLike extends Expression with NonSQLExpression { * @param arguments An optional list of expressions to pass as arguments to the function. * @param propagateNull When true, and any of the arguments is null, null will be returned instead * of calling the function. + * @param returnNullable When false, indicating the invoked method will always return + * non-null value. 
*/ case class StaticInvoke( staticObject: Class[_], dataType: DataType, functionName: String, arguments: Seq[Expression] = Nil, - propagateNull: Boolean = true) extends InvokeLike { + propagateNull: Boolean = true, + returnNullable: Boolean = true) extends InvokeLike { val objectName = staticObject.getName.stripSuffix("$") - override def nullable: Boolean = true + override def nullable: Boolean = needNullCheck || returnNullable override def children: Seq[Expression] = arguments override def eval(input: InternalRow): Any = @@ -141,19 +144,40 @@ case class StaticInvoke( val callFunc = s"$objectName.$functionName($argString)" - // If the function can return null, we do an extra check to make sure our null bit is still set - // correctly. - val postNullCheck = if (ctx.defaultValue(dataType) == "null") { - s"${ev.isNull} = ${ev.value} == null;" + val prepareIsNull = if (nullable) { + s"boolean ${ev.isNull} = $resultIsNull;" } else { + ev.isNull = "false" "" } + val evaluate = if (returnNullable) { + if (ctx.defaultValue(dataType) == "null") { + s""" + ${ev.value} = $callFunc; + ${ev.isNull} = ${ev.value} == null; + """ + } else { + val boxedResult = ctx.freshName("boxedResult") + s""" + ${ctx.boxedType(dataType)} $boxedResult = $callFunc; + ${ev.isNull} = $boxedResult == null; + if (!${ev.isNull}) { + ${ev.value} = $boxedResult; + } + """ + } + } else { + s"${ev.value} = $callFunc;" + } + val code = s""" $argCode - boolean ${ev.isNull} = $resultIsNull; - final $javaType ${ev.value} = $resultIsNull ? ${ctx.defaultValue(dataType)} : $callFunc; - $postNullCheck + $prepareIsNull + $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; + if (!$resultIsNull) { + $evaluate + } """ ev.copy(code = code) } From 4852b7d447e872079c2c81428354adc825a87b27 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 5 Jul 2017 18:41:00 +0800 Subject: [PATCH 092/779] [SPARK-21310][ML][PYSPARK] Expose offset in PySpark ## What changes were proposed in this pull request? Add offset to PySpark in GLM as in #16699. ## How was this patch tested? Python test Author: actuaryzhang Closes #18534 from actuaryzhang/pythonOffset. --- python/pyspark/ml/regression.py | 25 +++++++++++++++++++++---- python/pyspark/ml/tests.py | 14 ++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 84d843369e105..f0ff7a5f59abf 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1376,17 +1376,20 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha typeConverter=TypeConverters.toFloat) solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + "options: irls.", typeConverter=TypeConverters.toString) + offsetCol = Param(Params._dummy(), "offsetCol", "The offset column name. 
If this is not set " + + "or empty, we treat all instance offsets as 0.0", + typeConverter=TypeConverters.toString) @keyword_only def __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, - variancePower=0.0, linkPower=None): + variancePower=0.0, linkPower=None, offsetCol=None): """ __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", \ family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, \ regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, \ - variancePower=0.0, linkPower=None) + variancePower=0.0, linkPower=None, offsetCol=None) """ super(GeneralizedLinearRegression, self).__init__() self._java_obj = self._new_java_obj( @@ -1402,12 +1405,12 @@ def __init__(self, labelCol="label", featuresCol="features", predictionCol="pred def setParams(self, labelCol="label", featuresCol="features", predictionCol="prediction", family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, - variancePower=0.0, linkPower=None): + variancePower=0.0, linkPower=None, offsetCol=None): """ setParams(self, labelCol="label", featuresCol="features", predictionCol="prediction", \ family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, \ regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, \ - variancePower=0.0, linkPower=None) + variancePower=0.0, linkPower=None, offsetCol=None) Sets params for generalized linear regression. """ kwargs = self._input_kwargs @@ -1486,6 +1489,20 @@ def getLinkPower(self): """ return self.getOrDefault(self.linkPower) + @since("2.3.0") + def setOffsetCol(self, value): + """ + Sets the value of :py:attr:`offsetCol`. + """ + return self._set(offsetCol=value) + + @since("2.3.0") + def getOffsetCol(self): + """ + Gets the value of offsetCol or its default value. + """ + return self.getOrDefault(self.offsetCol) + class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index ffb8b0a890ff8..7870047651601 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1291,6 +1291,20 @@ def test_tweedie_distribution(self): self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4)) self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4)) + def test_offset(self): + + df = self.spark.createDataFrame( + [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)), + (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)), + (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)), + (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"]) + + glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset") + model = glr.fit(df) + self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581], + atol=1E-4)) + self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4)) + class FPGrowthTests(SparkSessionTestCase): def setUp(self): From 873f3ad2b89c955f42fced49dc129e8efa77d044 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 5 Jul 2017 20:32:47 +0800 Subject: [PATCH 093/779] [SPARK-16167][SQL] RowEncoder should preserve array/map type nullability. ## What changes were proposed in this pull request? 
Currently `RowEncoder` doesn't preserve nullability of `ArrayType` or `MapType`. It returns always `containsNull = true` for `ArrayType`, `valueContainsNull = true` for `MapType` and also the nullability of itself is always `true`. This pr fixes the nullability of them. ## How was this patch tested? Add tests to check if `RowEncoder` preserves array/map nullability. Author: Takuya UESHIN Author: Takuya UESHIN Closes #13873 from ueshin/issues/SPARK-16167. --- .../sql/catalyst/encoders/RowEncoder.scala | 25 +++++++++++--- .../catalyst/encoders/RowEncoderSuite.scala | 33 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index cc32fac67e924..43c35bbdf383a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -123,7 +123,7 @@ object RowEncoder { inputObject :: Nil, returnNullable = false) - case t @ ArrayType(et, cn) => + case t @ ArrayType(et, containsNull) => et match { case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType => StaticInvoke( @@ -132,8 +132,16 @@ object RowEncoder { "toArrayData", inputObject :: Nil, returnNullable = false) + case _ => MapObjects( - element => serializerFor(ValidateExternalType(element, et), et), + element => { + val value = serializerFor(ValidateExternalType(element, et), et) + if (!containsNull) { + AssertNotNull(value, Seq.empty) + } else { + value + } + }, inputObject, ObjectType(classOf[Object])) } @@ -155,10 +163,19 @@ object RowEncoder { ObjectType(classOf[scala.collection.Seq[_]]), returnNullable = false) val convertedValues = serializerFor(values, ArrayType(vt, valueNullable)) - NewInstance( + val nonNullOutput = NewInstance( classOf[ArrayBasedMapData], convertedKeys :: convertedValues :: Nil, - dataType = t) + dataType = t, + propagateNull = false) + + if (inputObject.nullable) { + If(IsNull(inputObject), + Literal.create(null, inputType), + nonNullOutput) + } else { + nonNullOutput + } case StructType(fields) => val nonNullOutput = CreateNamedStruct(fields.zipWithIndex.flatMap { case (field, index) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala index 1a5569a77dc7a..6ed175f86ca77 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala @@ -273,6 +273,39 @@ class RowEncoderSuite extends SparkFunSuite { assert(e4.getMessage.contains("java.lang.String is not a valid external type")) } + for { + elementType <- Seq(IntegerType, StringType) + containsNull <- Seq(true, false) + nullable <- Seq(true, false) + } { + test("RowEncoder should preserve array nullability: " + + s"ArrayType($elementType, containsNull = $containsNull), nullable = $nullable") { + val schema = new StructType().add("array", ArrayType(elementType, containsNull), nullable) + val encoder = RowEncoder(schema).resolveAndBind() + assert(encoder.serializer.length == 1) + assert(encoder.serializer.head.dataType == ArrayType(elementType, containsNull)) + assert(encoder.serializer.head.nullable == nullable) + } + } + + for { + keyType <- 
Seq(IntegerType, StringType) + valueType <- Seq(IntegerType, StringType) + valueContainsNull <- Seq(true, false) + nullable <- Seq(true, false) + } { + test("RowEncoder should preserve map nullability: " + + s"MapType($keyType, $valueType, valueContainsNull = $valueContainsNull), " + + s"nullable = $nullable") { + val schema = new StructType().add( + "map", MapType(keyType, valueType, valueContainsNull), nullable) + val encoder = RowEncoder(schema).resolveAndBind() + assert(encoder.serializer.length == 1) + assert(encoder.serializer.head.dataType == MapType(keyType, valueType, valueContainsNull)) + assert(encoder.serializer.head.nullable == nullable) + } + } + private def encodeDecodeTest(schema: StructType): Unit = { test(s"encode/decode: ${schema.simpleString}") { val encoder = RowEncoder(schema).resolveAndBind() From 5787ace463b2abde50d2ca24e8dd111e3a7c158e Mon Sep 17 00:00:00 2001 From: ouyangxiaochen Date: Wed, 5 Jul 2017 20:46:42 +0800 Subject: [PATCH 094/779] [SPARK-20383][SQL] Supporting Create [temporary] Function with the keyword 'OR REPLACE' and 'IF NOT EXISTS' ## What changes were proposed in this pull request? support to create [temporary] function with the keyword 'OR REPLACE' and 'IF NOT EXISTS' ## How was this patch tested? manual test and added test cases Please review http://spark.apache.org/contributing.html before opening a pull request. Author: ouyangxiaochen Closes #17681 from ouyangxiaochen/spark-419. --- .../spark/sql/catalyst/parser/SqlBase.g4 | 3 +- .../catalyst/catalog/ExternalCatalog.scala | 9 ++++ .../catalyst/catalog/InMemoryCatalog.scala | 6 +++ .../sql/catalyst/catalog/SessionCatalog.scala | 23 ++++++++ .../spark/sql/catalyst/catalog/events.scala | 10 ++++ .../catalog/ExternalCatalogEventSuite.scala | 9 ++++ .../catalog/ExternalCatalogSuite.scala | 9 ++++ .../spark/sql/execution/SparkSqlParser.scala | 8 +-- .../sql/execution/command/functions.scala | 46 +++++++++++----- .../execution/command/DDLCommandSuite.scala | 52 ++++++++++++++++++- .../sql/execution/command/DDLSuite.scala | 51 ++++++++++++++++++ .../spark/sql/hive/HiveExternalCatalog.scala | 9 ++++ 12 files changed, 216 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 29f554451ed4a..ef9f88a9026c9 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -126,7 +126,8 @@ statement tableIdentifier ('(' colTypeList ')')? tableProvider (OPTIONS tablePropertyList)? #createTempViewUsing | ALTER VIEW tableIdentifier AS? query #alterViewQuery - | CREATE TEMPORARY? FUNCTION qualifiedName AS className=STRING + | CREATE (OR REPLACE)? TEMPORARY? FUNCTION (IF NOT EXISTS)? + qualifiedName AS className=STRING (USING resource (',' resource)*)? #createFunction | DROP TEMPORARY? FUNCTION (IF EXISTS)? qualifiedName #dropFunction | EXPLAIN (LOGICAL | FORMATTED | EXTENDED | CODEGEN | COST)? 
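The grammar rule above is the core of this change: `CREATE FUNCTION` now optionally accepts `OR REPLACE` and `IF NOT EXISTS`. A minimal sketch of the statements the extended syntax accepts follows; the function names, database name, and resource path are illustrative, the class name is the one reused by the test cases later in this patch, and `spark` is assumed to be an active `SparkSession`:

```
// Permanent function: replace the existing definition if one is already registered.
spark.sql("CREATE OR REPLACE FUNCTION mydb.simple_udf AS " +
  "'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1'")

// Permanent function: keep the existing definition instead of failing.
spark.sql("CREATE FUNCTION IF NOT EXISTS mydb.simple_udf AS " +
  "'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1'")

// Temporary function: OR REPLACE is allowed; IF NOT EXISTS and a database prefix
// are rejected by CreateFunctionCommand further down in this patch.
spark.sql("CREATE OR REPLACE TEMPORARY FUNCTION simple_udf AS " +
  "'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1'")
```

Combining `OR REPLACE` with `IF NOT EXISTS` in a single statement is also rejected, as the new `DDLSuite` tests below verify.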
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 0254b6bb6d136..6000d483db209 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -332,6 +332,15 @@ abstract class ExternalCatalog protected def doDropFunction(db: String, funcName: String): Unit + final def alterFunction(db: String, funcDefinition: CatalogFunction): Unit = { + val name = funcDefinition.identifier.funcName + postToAll(AlterFunctionPreEvent(db, name)) + doAlterFunction(db, funcDefinition) + postToAll(AlterFunctionEvent(db, name)) + } + + protected def doAlterFunction(db: String, funcDefinition: CatalogFunction): Unit + final def renameFunction(db: String, oldName: String, newName: String): Unit = { postToAll(RenameFunctionPreEvent(db, oldName, newName)) doRenameFunction(db, oldName, newName) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 747190faa3c8c..d253c72a62739 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -590,6 +590,12 @@ class InMemoryCatalog( catalog(db).functions.remove(funcName) } + override protected def doAlterFunction(db: String, func: CatalogFunction): Unit = synchronized { + requireDbExists(db) + requireFunctionExists(db, func.identifier.funcName) + catalog(db).functions.put(func.identifier.funcName, func) + } + override protected def doRenameFunction( db: String, oldName: String, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a86604e4353ab..c40d5f6031a21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -1055,6 +1055,29 @@ class SessionCatalog( } } + /** + * overwirte a metastore function in the database specified in `funcDefinition`.. + * If no database is specified, assume the function is in the current database. + */ + def alterFunction(funcDefinition: CatalogFunction): Unit = { + val db = formatDatabaseName(funcDefinition.identifier.database.getOrElse(getCurrentDatabase)) + requireDbExists(db) + val identifier = FunctionIdentifier(funcDefinition.identifier.funcName, Some(db)) + val newFuncDefinition = funcDefinition.copy(identifier = identifier) + if (functionExists(identifier)) { + if (functionRegistry.functionExists(identifier)) { + // If we have loaded this function into the FunctionRegistry, + // also drop it from there. + // For a permanent function, because we loaded it to the FunctionRegistry + // when it's first used, we also need to drop it from the FunctionRegistry. + functionRegistry.dropFunction(identifier) + } + externalCatalog.alterFunction(db, newFuncDefinition) + } else { + throw new NoSuchFunctionException(db = db, func = identifier.toString) + } + } + /** * Retrieve the metadata of a metastore function. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/events.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/events.scala index 459973a13bb10..742a51e640383 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/events.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/events.scala @@ -139,6 +139,16 @@ case class DropFunctionPreEvent(database: String, name: String) extends Function */ case class DropFunctionEvent(database: String, name: String) extends FunctionEvent +/** + * Event fired before a function is altered. + */ +case class AlterFunctionPreEvent(database: String, name: String) extends FunctionEvent + +/** + * Event fired after a function has been altered. + */ +case class AlterFunctionEvent(database: String, name: String) extends FunctionEvent + /** * Event fired before a function is renamed. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogEventSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogEventSuite.scala index 2539ea615ff92..087c26f23f383 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogEventSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogEventSuite.scala @@ -176,6 +176,15 @@ class ExternalCatalogEventSuite extends SparkFunSuite { } checkEvents(RenameFunctionPreEvent("db5", "fn7", "fn4") :: Nil) + // ALTER + val alteredFunctionDefinition = CatalogFunction( + identifier = FunctionIdentifier("fn4", Some("db5")), + className = "org.apache.spark.AlterFunction", + resources = Seq.empty) + catalog.alterFunction("db5", alteredFunctionDefinition) + checkEvents( + AlterFunctionPreEvent("db5", "fn4") :: AlterFunctionEvent("db5", "fn4") :: Nil) + // DROP intercept[AnalysisException] { catalog.dropFunction("db5", "fn7") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index c22d55fc96a65..66e895a4690c1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -752,6 +752,14 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac } } + test("alter function") { + val catalog = newBasicCatalog() + assert(catalog.getFunction("db2", "func1").className == funcClass) + val myNewFunc = catalog.getFunction("db2", "func1").copy(className = newFuncClass) + catalog.alterFunction("db2", myNewFunc) + assert(catalog.getFunction("db2", "func1").className == newFuncClass) + } + test("list functions") { val catalog = newBasicCatalog() catalog.createFunction("db2", newFunc("func2")) @@ -916,6 +924,7 @@ abstract class CatalogTestUtils { lazy val partWithEmptyValue = CatalogTablePartition(Map("a" -> "3", "b" -> ""), storageFormat) lazy val funcClass = "org.apache.spark.myFunc" + lazy val newFuncClass = "org.apache.spark.myNewFunc" /** * Creates a basic catalog, with the following structure: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 2b79eb5eac0f1..2f8e416e7df1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -687,8 +687,8 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * * For example: * {{{ - * CREATE [TEMPORARY] FUNCTION [db_name.]function_name AS class_name - * [USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri']]; + * CREATE [OR REPLACE] [TEMPORARY] FUNCTION [IF NOT EXISTS] [db_name.]function_name + * AS class_name [USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri']]; * }}} */ override def visitCreateFunction(ctx: CreateFunctionContext): LogicalPlan = withOrigin(ctx) { @@ -709,7 +709,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { functionIdentifier.funcName, string(ctx.className), resources, - ctx.TEMPORARY != null) + ctx.TEMPORARY != null, + ctx.EXISTS != null, + ctx.REPLACE != null) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index a91ad413f4d1b..4f92ffee687aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -31,13 +31,13 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} * The DDL command that creates a function. * To create a temporary function, the syntax of using this command in SQL is: * {{{ - * CREATE TEMPORARY FUNCTION functionName + * CREATE [OR REPLACE] TEMPORARY FUNCTION functionName * AS className [USING JAR\FILE 'uri' [, JAR|FILE 'uri']] * }}} * * To create a permanent function, the syntax in SQL is: * {{{ - * CREATE FUNCTION [databaseName.]functionName + * CREATE [OR REPLACE] FUNCTION [IF NOT EXISTS] [databaseName.]functionName * AS className [USING JAR\FILE 'uri' [, JAR|FILE 'uri']] * }}} */ @@ -46,26 +46,46 @@ case class CreateFunctionCommand( functionName: String, className: String, resources: Seq[FunctionResource], - isTemp: Boolean) + isTemp: Boolean, + ifNotExists: Boolean, + replace: Boolean) extends RunnableCommand { + if (ifNotExists && replace) { + throw new AnalysisException("CREATE FUNCTION with both IF NOT EXISTS and REPLACE" + + " is not allowed.") + } + + // Disallow to define a temporary function with `IF NOT EXISTS` + if (ifNotExists && isTemp) { + throw new AnalysisException( + "It is not allowed to define a TEMPORARY function with IF NOT EXISTS.") + } + + // Temporary function names should not contain database prefix like "database.function" + if (databaseName.isDefined && isTemp) { + throw new AnalysisException(s"Specifying a database in CREATE TEMPORARY FUNCTION " + + s"is not allowed: '${databaseName.get}'") + } + override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog val func = CatalogFunction(FunctionIdentifier(functionName, databaseName), className, resources) if (isTemp) { - if (databaseName.isDefined) { - throw new AnalysisException(s"Specifying a database in CREATE TEMPORARY FUNCTION " + - s"is not allowed: '${databaseName.get}'") - } // We first load resources and then put the builder in the function registry. catalog.loadFunctionResources(resources) - catalog.registerFunction(func, overrideIfExists = false) + catalog.registerFunction(func, overrideIfExists = replace) } else { - // For a permanent, we will store the metadata into underlying external catalog. - // This function will be loaded into the FunctionRegistry when a query uses it. 
- // We do not load it into FunctionRegistry right now. - // TODO: should we also parse "IF NOT EXISTS"? - catalog.createFunction(func, ignoreIfExists = false) + // Handles `CREATE OR REPLACE FUNCTION AS ... USING ...` + if (replace && catalog.functionExists(func.identifier)) { + // alter the function in the metastore + catalog.alterFunction(CatalogFunction(func.identifier, className, resources)) + } else { + // For a permanent, we will store the metadata into underlying external catalog. + // This function will be loaded into the FunctionRegistry when a query uses it. + // We do not load it into FunctionRegistry right now. + catalog.createFunction(CatalogFunction(func.identifier, className, resources), ifNotExists) + } } Seq.empty[Row] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 8a6bc62fec96c..5643c58d9f847 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -181,8 +181,29 @@ class DDLCommandSuite extends PlanTest { |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', |FILE '/path/to/file' """.stripMargin + val sql3 = + """ + |CREATE OR REPLACE TEMPORARY FUNCTION helloworld3 as + |'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1', + |JAR '/path/to/jar2' + """.stripMargin + val sql4 = + """ + |CREATE OR REPLACE FUNCTION hello.world1 as + |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', + |FILE '/path/to/file' + """.stripMargin + val sql5 = + """ + |CREATE FUNCTION IF NOT EXISTS hello.world2 as + |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', + |FILE '/path/to/file' + """.stripMargin val parsed1 = parser.parsePlan(sql1) val parsed2 = parser.parsePlan(sql2) + val parsed3 = parser.parsePlan(sql3) + val parsed4 = parser.parsePlan(sql4) + val parsed5 = parser.parsePlan(sql5) val expected1 = CreateFunctionCommand( None, "helloworld", @@ -190,7 +211,7 @@ class DDLCommandSuite extends PlanTest { Seq( FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), - isTemp = true) + isTemp = true, ifNotExists = false, replace = false) val expected2 = CreateFunctionCommand( Some("hello"), "world", @@ -198,9 +219,36 @@ class DDLCommandSuite extends PlanTest { Seq( FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false) + isTemp = false, ifNotExists = false, replace = false) + val expected3 = CreateFunctionCommand( + None, + "helloworld3", + "com.matthewrathbone.example.SimpleUDFExample", + Seq( + FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), + FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), + isTemp = true, ifNotExists = false, replace = true) + val expected4 = CreateFunctionCommand( + Some("hello"), + "world1", + "com.matthewrathbone.example.SimpleUDFExample", + Seq( + FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), + FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), + isTemp = false, ifNotExists = false, replace = true) + val expected5 = CreateFunctionCommand( + Some("hello"), 
+ "world2", + "com.matthewrathbone.example.SimpleUDFExample", + Seq( + FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), + FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), + isTemp = false, ifNotExists = true, replace = false) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) + comparePlans(parsed3, expected3) + comparePlans(parsed4, expected4) + comparePlans(parsed5, expected5) } test("drop function") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index e4dd077715d0f..5c40d8bb4b1ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2270,6 +2270,57 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("create temporary function with if not exists") { + withUserDefinedFunction("func1" -> true) { + val sql1 = + """ + |CREATE TEMPORARY FUNCTION IF NOT EXISTS func1 as + |'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1', + |JAR '/path/to/jar2' + """.stripMargin + val e = intercept[AnalysisException] { + sql(sql1) + }.getMessage + assert(e.contains("It is not allowed to define a TEMPORARY function with IF NOT EXISTS")) + } + } + + test("create function with both if not exists and replace") { + withUserDefinedFunction("func1" -> false) { + val sql1 = + """ + |CREATE OR REPLACE FUNCTION IF NOT EXISTS func1 as + |'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1', + |JAR '/path/to/jar2' + """.stripMargin + val e = intercept[AnalysisException] { + sql(sql1) + }.getMessage + assert(e.contains("CREATE FUNCTION with both IF NOT EXISTS and REPLACE is not allowed")) + } + } + + test("create temporary function by specifying a database") { + val dbName = "mydb" + withDatabase(dbName) { + sql(s"CREATE DATABASE $dbName") + sql(s"USE $dbName") + withUserDefinedFunction("func1" -> true) { + val sql1 = + s""" + |CREATE TEMPORARY FUNCTION $dbName.func1 as + |'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1', + |JAR '/path/to/jar2' + """.stripMargin + val e = intercept[AnalysisException] { + sql(sql1) + }.getMessage + assert(e.contains(s"Specifying a database in CREATE TEMPORARY FUNCTION " + + s"is not allowed: '$dbName'")) + } + } + } + Seq(true, false).foreach { caseSensitive => test(s"alter table add columns with existing column name - caseSensitive $caseSensitive") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> s"$caseSensitive") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 2a17849fa8a34..306b38048e3a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -1132,6 +1132,15 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat client.dropFunction(db, name) } + override protected def doAlterFunction( + db: String, funcDefinition: CatalogFunction): Unit = withClient { + requireDbExists(db) + val functionName = funcDefinition.identifier.funcName.toLowerCase(Locale.ROOT) + requireFunctionExists(db, functionName) + val functionIdentifier = funcDefinition.identifier.copy(funcName = functionName) + client.alterFunction(db, 
funcDefinition.copy(identifier = functionIdentifier)) + } + override protected def doRenameFunction( db: String, oldName: String, From e3e2b5da3671a6c6d152b4de481a8aa3e57a6e42 Mon Sep 17 00:00:00 2001 From: "he.qiao" Date: Wed, 5 Jul 2017 21:13:25 +0800 Subject: [PATCH 095/779] [SPARK-21286][TEST] Modified StorageTabSuite unit test ## What changes were proposed in this pull request? The old unit test had no effect ## How was this patch tested? unit test Author: he.qiao Closes #18511 from Geek-He/dev_0703. --- .../scala/org/apache/spark/ui/storage/StorageTabSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala index 66dda382eb653..1cb52593e7060 100644 --- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala @@ -74,7 +74,7 @@ class StorageTabSuite extends SparkFunSuite with BeforeAndAfter { // Submitting RDDInfos with duplicate IDs does nothing val rddInfo0Cached = new RDDInfo(0, "freedom", 100, StorageLevel.MEMORY_ONLY, Seq(10)) rddInfo0Cached.numCachedPartitions = 1 - val stageInfo0Cached = new StageInfo(0, 0, "0", 100, Seq(rddInfo0), Seq.empty, "details") + val stageInfo0Cached = new StageInfo(0, 0, "0", 100, Seq(rddInfo0Cached), Seq.empty, "details") bus.postToAll(SparkListenerStageSubmitted(stageInfo0Cached)) assert(storageListener._rddInfoMap.size === 4) assert(storageListener.rddInfoList.size === 2) From 960298ee66b9b8a80f84df679ce5b4b3846267f4 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Wed, 5 Jul 2017 14:40:44 +0100 Subject: [PATCH 096/779] [SPARK-20858][DOC][MINOR] Document ListenerBus event queue size ## What changes were proposed in this pull request? This change adds a new configuration option `spark.scheduler.listenerbus.eventqueue.size` to the configuration docs to specify the capacity of the spark listener bus event queue. Default value is 10000. This is a doc PR for [SPARK-15703](https://issues.apache.org/jira/browse/SPARK-15703). I added the option to the `Scheduling` section, however it might be more related to the `Spark UI` section. ## How was this patch tested? Manually verified correct rendering of configuration option. Author: sadikovi Author: Ivan Sadikov Closes #18476 from sadikovi/SPARK-20858. --- docs/configuration.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index bd6a1f9e240e2..c785a664c67b1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -725,7 +725,7 @@ Apart from these, the following properties are also available, and may be useful spark.ui.retainedJobs 1000 - How many jobs the Spark UI and status APIs remember before garbage collecting. + How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances. @@ -733,7 +733,7 @@ Apart from these, the following properties are also available, and may be useful spark.ui.retainedStages 1000 - How many stages the Spark UI and status APIs remember before garbage collecting. + How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances. 
@@ -741,7 +741,7 @@ Apart from these, the following properties are also available, and may be useful spark.ui.retainedTasks 100000 - How many tasks the Spark UI and status APIs remember before garbage collecting. + How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances. @@ -1389,6 +1389,15 @@ Apart from these, the following properties are also available, and may be useful The interval length for the scheduler to revive the worker resource offers to run tasks. + + spark.scheduler.listenerbus.eventqueue.capacity + 10000 + + Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing + value (e.g. 20000) if listener events are dropped. Increasing this value may result in the + driver using more memory. + + spark.blacklist.enabled @@ -1475,8 +1484,8 @@ Apart from these, the following properties are also available, and may be useful spark.blacklist.application.fetchFailure.enabled false - (Experimental) If set to "true", Spark will blacklist the executor immediately when a fetch - failure happenes. If external shuffle service is enabled, then the whole node will be + (Experimental) If set to "true", Spark will blacklist the executor immediately when a fetch + failure happenes. If external shuffle service is enabled, then the whole node will be blacklisted. From 742da0868534dab3d4d7b7edbe5ba9dc8bf26cc8 Mon Sep 17 00:00:00 2001 From: Jeff Zhang Date: Wed, 5 Jul 2017 10:59:10 -0700 Subject: [PATCH 097/779] [SPARK-19439][PYSPARK][SQL] PySpark's registerJavaFunction Should Support UDAFs ## What changes were proposed in this pull request? Support register Java UDAFs in PySpark so that user can use Java UDAF in PySpark. Besides that I also add api in `UDFRegistration` ## How was this patch tested? Unit test is added Author: Jeff Zhang Closes #17222 from zjffdu/SPARK-19439. --- python/pyspark/sql/context.py | 23 ++++++++ python/pyspark/sql/tests.py | 10 ++++ .../apache/spark/sql/UDFRegistration.scala | 33 +++++++++-- .../org/apache/spark/sql/JavaUDAFSuite.java | 55 +++++++++++++++++++ .../org/apache/spark/sql}/MyDoubleAvg.java | 2 +- .../org/apache/spark/sql}/MyDoubleSum.java | 8 +-- sql/hive/pom.xml | 7 +++ .../spark/sql/hive/JavaDataFrameSuite.java | 2 +- .../execution/AggregationQuerySuite.scala | 5 +- 9 files changed, 132 insertions(+), 13 deletions(-) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java rename sql/{hive/src/test/java/org/apache/spark/sql/hive/aggregate => core/src/test/java/test/org/apache/spark/sql}/MyDoubleAvg.java (99%) rename sql/{hive/src/test/java/org/apache/spark/sql/hive/aggregate => core/src/test/java/test/org/apache/spark/sql}/MyDoubleSum.java (98%) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 426f07cd9410d..c44ab247fd3d3 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -232,6 +232,23 @@ def registerJavaFunction(self, name, javaClassName, returnType=None): jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) + @ignore_unicode_prefix + @since(2.3) + def registerJavaUDAF(self, name, javaClassName): + """Register a java UDAF so it can be used in SQL statements. + + :param name: name of the UDAF + :param javaClassName: fully qualified name of java class + + >>> sqlContext.registerJavaUDAF("javaUDAF", + ... 
"test.org.apache.spark.sql.MyDoubleAvg") + >>> df = sqlContext.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) + >>> df.registerTempTable("df") + >>> sqlContext.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() + [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] + """ + self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) + # TODO(andrew): delete this once we refactor things to take in SparkSession def _inferSchema(self, rdd, samplingRatio=None): """ @@ -551,6 +568,12 @@ def __init__(self, sqlContext): def register(self, name, f, returnType=StringType()): return self.sqlContext.registerFunction(name, f, returnType) + def registerJavaFunction(self, name, javaClassName, returnType=None): + self.sqlContext.registerJavaFunction(name, javaClassName, returnType) + + def registerJavaUDAF(self, name, javaClassName): + self.sqlContext.registerJavaUDAF(name, javaClassName) + register.__doc__ = SQLContext.registerFunction.__doc__ diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 16ba8bd73f400..c0e3b8d132396 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -481,6 +481,16 @@ def test_udf_registration_returns_udf(self): df.select(add_three("id").alias("plus_three")).collect() ) + def test_non_existed_udf(self): + spark = self.spark + self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", + lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) + + def test_non_existed_udaf(self): + spark = self.spark + self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf", + lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf")) + def test_multiLine_json(self): people1 = self.spark.read.json("python/test_support/sql/people.json") people_array = self.spark.read.json("python/test_support/sql/people_array.json", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index ad01b889429c7..8bdc0221888d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql -import java.io.IOException import java.lang.reflect.{ParameterizedType, Type} import scala.reflect.runtime.universe.TypeTag @@ -456,9 +455,9 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends .map(_.asInstanceOf[ParameterizedType]) .filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith("org.apache.spark.sql.api.java.UDF")) if (udfInterfaces.length == 0) { - throw new IOException(s"UDF class ${className} doesn't implement any UDF interface") + throw new AnalysisException(s"UDF class ${className} doesn't implement any UDF interface") } else if (udfInterfaces.length > 1) { - throw new IOException(s"It is invalid to implement multiple UDF interfaces, UDF class ${className}") + throw new AnalysisException(s"It is invalid to implement multiple UDF interfaces, UDF class ${className}") } else { try { val udf = clazz.newInstance() @@ -491,19 +490,41 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends case 21 => register(name, udf.asInstanceOf[UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) case 22 => register(name, udf.asInstanceOf[UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 
_, _]], returnType) case 23 => register(name, udf.asInstanceOf[UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType) - case n => logError(s"UDF class with ${n} type arguments is not supported ") + case n => + throw new AnalysisException(s"UDF class with ${n} type arguments is not supported.") } } catch { case e @ (_: InstantiationException | _: IllegalArgumentException) => - logError(s"Can not instantiate class ${className}, please make sure it has public non argument constructor") + throw new AnalysisException(s"Can not instantiate class ${className}, please make sure it has public non argument constructor") } } } catch { - case e: ClassNotFoundException => logError(s"Can not load class ${className}, please make sure it is on the classpath") + case e: ClassNotFoundException => throw new AnalysisException(s"Can not load class ${className}, please make sure it is on the classpath") } } + /** + * Register a Java UDAF class using reflection, for use from pyspark + * + * @param name UDAF name + * @param className fully qualified class name of UDAF + */ + private[sql] def registerJavaUDAF(name: String, className: String): Unit = { + try { + val clazz = Utils.classForName(className) + if (!classOf[UserDefinedAggregateFunction].isAssignableFrom(clazz)) { + throw new AnalysisException(s"class $className doesn't implement interface UserDefinedAggregateFunction") + } + val udaf = clazz.newInstance().asInstanceOf[UserDefinedAggregateFunction] + register(name, udaf) + } catch { + case e: ClassNotFoundException => throw new AnalysisException(s"Can not load class ${className}, please make sure it is on the classpath") + case e @ (_: InstantiationException | _: IllegalArgumentException) => + throw new AnalysisException(s"Can not instantiate class ${className}, please make sure it has public non argument constructor") + } + } + /** * Register a user-defined function with 1 arguments. * @since 1.3.0 diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java new file mode 100644 index 0000000000000..ddbaa45a483cb --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDAFSuite.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql; + +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + + +public class JavaUDAFSuite { + + private transient SparkSession spark; + + @Before + public void setUp() { + spark = SparkSession.builder() + .master("local[*]") + .appName("testing") + .getOrCreate(); + } + + @After + public void tearDown() { + spark.stop(); + spark = null; + } + + @SuppressWarnings("unchecked") + @Test + public void udf1Test() { + spark.range(1, 10).toDF("value").registerTempTable("df"); + spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName()); + Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head(); + Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6); + } + +} diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java similarity index 99% rename from sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java rename to sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java index ae0c097c362ab..447a71d284fbb 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.hive.aggregate; +package test.org.apache.spark.sql; import java.util.ArrayList; import java.util.List; diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleSum.java similarity index 98% rename from sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java rename to sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleSum.java index d17fb3e5194f3..93d20330c717f 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleSum.java @@ -15,18 +15,18 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive.aggregate; +package test.org.apache.spark.sql; import java.util.ArrayList; import java.util.List; +import org.apache.spark.sql.Row; import org.apache.spark.sql.expressions.MutableAggregationBuffer; import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; /** * An example {@link UserDefinedAggregateFunction} to calculate the sum of a diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 09dcc4055e000..f9462e79a69f3 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -57,6 +57,13 @@ spark-sql_${scala.binary.version} ${project.version} + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + org.apache.spark spark-tags_${scala.binary.version} diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index aefc9cc77da88..636ce10da3734 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -31,7 +31,7 @@ import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; import static org.apache.spark.sql.functions.*; import org.apache.spark.sql.hive.test.TestHive$; -import org.apache.spark.sql.hive.aggregate.MyDoubleSum; +import test.org.apache.spark.sql.MyDoubleSum; public class JavaDataFrameSuite { private transient SQLContext hc; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index 84f915977bd88..f245a79f805a2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -20,16 +20,19 @@ package org.apache.spark.sql.hive.execution import scala.collection.JavaConverters._ import scala.util.Random +import test.org.apache.spark.sql.MyDoubleAvg +import test.org.apache.spark.sql.MyDoubleSum + import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ + class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFunction { def inputSchema: StructType = schema From c8e7f445b98fce0b419b26f43dd3a75bf7c7375b Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 5 Jul 2017 11:06:15 -0700 Subject: [PATCH 098/779] [SPARK-21307][SQL] Remove SQLConf parameters from the parser-related classes. ### What changes were proposed in this pull request? This PR is to remove SQLConf parameters from the parser-related classes. ### How was this patch tested? The existing test cases. Author: gatorsmile Closes #18531 from gatorsmile/rmSQLConfParser. 
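A minimal sketch of what the change looks like from the caller side, based only on the constructor changes visible in the diffs below; conf-dependent parsing now reads the active `SQLConf.get` instead of a conf passed at construction time:

```
import org.apache.spark.sql.execution.SparkSqlParser
import org.apache.spark.sql.internal.SQLConf

// Before this patch a conf instance had to be threaded through explicitly:
//   val parser = new SparkSqlParser(new SQLConf)
// After it, the parser takes no constructor argument and picks up the active conf:
val parser = new SparkSqlParser
val plan = parser.parsePlan("SELECT * FROM t DISTRIBUTE BY a")

// In the updated suites, conf-dependent behaviour (e.g. escaped string literals)
// is toggled through the existing withSQLConf test helper rather than by
// mutating a locally constructed SQLConf:
//   withSQLConf(SQLConf.ESCAPED_STRING_LITERALS.key -> "true") { ... }
```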
--- .../sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 6 +- .../sql/catalyst/parser/ParseDriver.scala | 8 +- .../parser/ExpressionParserSuite.scala | 167 +++++++++--------- .../spark/sql/execution/SparkSqlParser.scala | 11 +- .../org/apache/spark/sql/functions.scala | 3 +- .../internal/BaseSessionStateBuilder.scala | 2 +- .../sql/internal/VariableSubstitution.scala | 4 +- .../sql/execution/SparkSqlParserSuite.scala | 10 +- .../execution/command/DDLCommandSuite.scala | 4 +- .../internal/VariableSubstitutionSuite.scala | 31 ++-- 11 files changed, 121 insertions(+), 127 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c40d5f6031a21..336d3d65d0dd0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -74,7 +74,7 @@ class SessionCatalog( functionRegistry, conf, new Configuration(), - new CatalystSqlParser(conf), + CatalystSqlParser, DummyFunctionResourceLoader) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 8eac3ef2d3568..b6a4686bb9ec9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -45,11 +45,9 @@ import org.apache.spark.util.random.RandomSampler * The AstBuilder converts an ANTLR4 ParseTree into a catalyst Expression, LogicalPlan or * TableIdentifier. */ -class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging { +class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { import ParserUtils._ - def this() = this(new SQLConf()) - protected def typedVisit[T](ctx: ParseTree): T = { ctx.accept(this).asInstanceOf[T] } @@ -1457,7 +1455,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging * Special characters can be escaped by using Hive/C-style escaping. */ private def createString(ctx: StringLiteralContext): String = { - if (conf.escapedStringLiterals) { + if (SQLConf.get.escapedStringLiterals) { ctx.STRING().asScala.map(stringWithoutUnescape).mkString } else { ctx.STRING().asScala.map(string).mkString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index 09598ffe770c6..7e1fcfefc64a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.Origin -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} /** @@ -122,13 +121,8 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { /** * Concrete SQL parser for Catalyst-only SQL statements. 
*/ -class CatalystSqlParser(conf: SQLConf) extends AbstractSqlParser { - val astBuilder = new AstBuilder(conf) -} - -/** For test-only. */ object CatalystSqlParser extends AbstractSqlParser { - val astBuilder = new AstBuilder(new SQLConf()) + val astBuilder = new AstBuilder } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 45f9f72dccc45..ac7325257a15a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -167,12 +167,12 @@ class ExpressionParserSuite extends PlanTest { } test("like expressions with ESCAPED_STRING_LITERALS = true") { - val conf = new SQLConf() - conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, "true") - val parser = new CatalystSqlParser(conf) - assertEqual("a rlike '^\\x20[\\x20-\\x23]+$'", 'a rlike "^\\x20[\\x20-\\x23]+$", parser) - assertEqual("a rlike 'pattern\\\\'", 'a rlike "pattern\\\\", parser) - assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser) + val parser = CatalystSqlParser + withSQLConf(SQLConf.ESCAPED_STRING_LITERALS.key -> "true") { + assertEqual("a rlike '^\\x20[\\x20-\\x23]+$'", 'a rlike "^\\x20[\\x20-\\x23]+$", parser) + assertEqual("a rlike 'pattern\\\\'", 'a rlike "pattern\\\\", parser) + assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser) + } } test("is null expressions") { @@ -435,86 +435,85 @@ class ExpressionParserSuite extends PlanTest { } test("strings") { + val parser = CatalystSqlParser Seq(true, false).foreach { escape => - val conf = new SQLConf() - conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, escape.toString) - val parser = new CatalystSqlParser(conf) - - // tests that have same result whatever the conf is - // Single Strings. - assertEqual("\"hello\"", "hello", parser) - assertEqual("'hello'", "hello", parser) - - // Multi-Strings. - assertEqual("\"hello\" 'world'", "helloworld", parser) - assertEqual("'hello' \" \" 'world'", "hello world", parser) - - // 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a - // regular '%'; to get the correct result you need to add another escaped '\'. - // TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? - assertEqual("'pattern%'", "pattern%", parser) - assertEqual("'no-pattern\\%'", "no-pattern\\%", parser) - - // tests that have different result regarding the conf - if (escape) { - // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to - // Spark 1.6 behavior. - - // 'LIKE' string literals. - assertEqual("'pattern\\\\%'", "pattern\\\\%", parser) - assertEqual("'pattern\\\\\\%'", "pattern\\\\\\%", parser) - - // Escaped characters. - // Unescape string literal "'\\0'" for ASCII NUL (X'00') doesn't work - // when ESCAPED_STRING_LITERALS is enabled. - // It is parsed literally. - assertEqual("'\\0'", "\\0", parser) - - // Note: Single quote follows 1.6 parsing behavior when ESCAPED_STRING_LITERALS is enabled. - val e = intercept[ParseException](parser.parseExpression("'\''")) - assert(e.message.contains("extraneous input '''")) - - // The unescape special characters (e.g., "\\t") for 2.0+ don't work - // when ESCAPED_STRING_LITERALS is enabled. They are parsed literally. 
- assertEqual("'\\\"'", "\\\"", parser) // Double quote - assertEqual("'\\b'", "\\b", parser) // Backspace - assertEqual("'\\n'", "\\n", parser) // Newline - assertEqual("'\\r'", "\\r", parser) // Carriage return - assertEqual("'\\t'", "\\t", parser) // Tab character - - // The unescape Octals for 2.0+ don't work when ESCAPED_STRING_LITERALS is enabled. - // They are parsed literally. - assertEqual("'\\110\\145\\154\\154\\157\\041'", "\\110\\145\\154\\154\\157\\041", parser) - // The unescape Unicode for 2.0+ doesn't work when ESCAPED_STRING_LITERALS is enabled. - // They are parsed literally. - assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", - "\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029", parser) - } else { - // Default behavior - - // 'LIKE' string literals. - assertEqual("'pattern\\\\%'", "pattern\\%", parser) - assertEqual("'pattern\\\\\\%'", "pattern\\\\%", parser) - - // Escaped characters. - // See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html - assertEqual("'\\0'", "\u0000", parser) // ASCII NUL (X'00') - assertEqual("'\\''", "\'", parser) // Single quote - assertEqual("'\\\"'", "\"", parser) // Double quote - assertEqual("'\\b'", "\b", parser) // Backspace - assertEqual("'\\n'", "\n", parser) // Newline - assertEqual("'\\r'", "\r", parser) // Carriage return - assertEqual("'\\t'", "\t", parser) // Tab character - assertEqual("'\\Z'", "\u001A", parser) // ASCII 26 - CTRL + Z (EOF on windows) - - // Octals - assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!", parser) - - // Unicode - assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)", - parser) + withSQLConf(SQLConf.ESCAPED_STRING_LITERALS.key -> escape.toString) { + // tests that have same result whatever the conf is + // Single Strings. + assertEqual("\"hello\"", "hello", parser) + assertEqual("'hello'", "hello", parser) + + // Multi-Strings. + assertEqual("\"hello\" 'world'", "helloworld", parser) + assertEqual("'hello' \" \" 'world'", "hello world", parser) + + // 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a + // regular '%'; to get the correct result you need to add another escaped '\'. + // TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? + assertEqual("'pattern%'", "pattern%", parser) + assertEqual("'no-pattern\\%'", "no-pattern\\%", parser) + + // tests that have different result regarding the conf + if (escape) { + // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to + // Spark 1.6 behavior. + + // 'LIKE' string literals. + assertEqual("'pattern\\\\%'", "pattern\\\\%", parser) + assertEqual("'pattern\\\\\\%'", "pattern\\\\\\%", parser) + + // Escaped characters. + // Unescape string literal "'\\0'" for ASCII NUL (X'00') doesn't work + // when ESCAPED_STRING_LITERALS is enabled. + // It is parsed literally. + assertEqual("'\\0'", "\\0", parser) + + // Note: Single quote follows 1.6 parsing behavior when ESCAPED_STRING_LITERALS is + // enabled. + val e = intercept[ParseException](parser.parseExpression("'\''")) + assert(e.message.contains("extraneous input '''")) + + // The unescape special characters (e.g., "\\t") for 2.0+ don't work + // when ESCAPED_STRING_LITERALS is enabled. They are parsed literally. 
+ assertEqual("'\\\"'", "\\\"", parser) // Double quote + assertEqual("'\\b'", "\\b", parser) // Backspace + assertEqual("'\\n'", "\\n", parser) // Newline + assertEqual("'\\r'", "\\r", parser) // Carriage return + assertEqual("'\\t'", "\\t", parser) // Tab character + + // The unescape Octals for 2.0+ don't work when ESCAPED_STRING_LITERALS is enabled. + // They are parsed literally. + assertEqual("'\\110\\145\\154\\154\\157\\041'", "\\110\\145\\154\\154\\157\\041", parser) + // The unescape Unicode for 2.0+ doesn't work when ESCAPED_STRING_LITERALS is enabled. + // They are parsed literally. + assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", + "\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029", parser) + } else { + // Default behavior + + // 'LIKE' string literals. + assertEqual("'pattern\\\\%'", "pattern\\%", parser) + assertEqual("'pattern\\\\\\%'", "pattern\\\\%", parser) + + // Escaped characters. + // See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html + assertEqual("'\\0'", "\u0000", parser) // ASCII NUL (X'00') + assertEqual("'\\''", "\'", parser) // Single quote + assertEqual("'\\\"'", "\"", parser) // Double quote + assertEqual("'\\b'", "\b", parser) // Backspace + assertEqual("'\\n'", "\n", parser) // Newline + assertEqual("'\\r'", "\r", parser) // Carriage return + assertEqual("'\\t'", "\t", parser) // Tab character + assertEqual("'\\Z'", "\u001A", parser) // ASCII 26 - CTRL + Z (EOF on windows) + + // Octals + assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!", parser) + + // Unicode + assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)", + parser) + } } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 2f8e416e7df1b..618d027d8dc07 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -39,10 +39,11 @@ import org.apache.spark.sql.types.StructType /** * Concrete parser for Spark SQL statements. */ -class SparkSqlParser(conf: SQLConf) extends AbstractSqlParser { - val astBuilder = new SparkSqlAstBuilder(conf) +class SparkSqlParser extends AbstractSqlParser { - private val substitutor = new VariableSubstitution(conf) + val astBuilder = new SparkSqlAstBuilder + + private val substitutor = new VariableSubstitution protected override def parse[T](command: String)(toResult: SqlBaseParser => T): T = { super.parse(substitutor.substitute(command))(toResult) @@ -52,9 +53,11 @@ class SparkSqlParser(conf: SQLConf) extends AbstractSqlParser { /** * Builder that converts an ANTLR ParseTree into a LogicalPlan/Expression/TableIdentifier. */ -class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { +class SparkSqlAstBuilder extends AstBuilder { import org.apache.spark.sql.catalyst.parser.ParserUtils._ + private def conf: SQLConf = SQLConf.get + /** * Create a [[SetCommand]] logical plan. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 839cbf42024e3..3c67960d13e09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, ResolvedHint} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -1276,7 +1275,7 @@ object functions { */ def expr(expr: String): Column = { val parser = SparkSession.getActiveSession.map(_.sessionState.sqlParser).getOrElse { - new SparkSqlParser(new SQLConf) + new SparkSqlParser } Column(parser.parseExpression(expr)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 2532b2ddb72df..9d0148117fadf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -114,7 +114,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on the `conf` field. */ protected lazy val sqlParser: ParserInterface = { - extensions.buildParser(session, new SparkSqlParser(conf)) + extensions.buildParser(session, new SparkSqlParser) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 4e7c813be9922..2b9c574aaaf0c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -25,7 +25,9 @@ import org.apache.spark.internal.config._ * * Variable substitution is controlled by `SQLConf.variableSubstituteEnabled`. 
*/ -class VariableSubstitution(conf: SQLConf) { +class VariableSubstitution { + + private def conf = SQLConf.get private val provider = new ConfigProvider { override def get(key: String): Option[String] = Option(conf.getConfString(key, "")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index d238c76fbeeff..2e29fa43f73d9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -37,8 +37,7 @@ import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType */ class SparkSqlParserSuite extends AnalysisTest { - val newConf = new SQLConf - private lazy val parser = new SparkSqlParser(newConf) + private lazy val parser = new SparkSqlParser /** * Normalizes plans: @@ -285,6 +284,7 @@ class SparkSqlParserSuite extends AnalysisTest { } test("query organization") { + val conf = SQLConf.get // Test all valid combinations of order by/sort by/distribute by/cluster by/limit/windows val baseSql = "select * from t" val basePlan = @@ -293,20 +293,20 @@ class SparkSqlParserSuite extends AnalysisTest { assertEqual(s"$baseSql distribute by a, b", RepartitionByExpression(UnresolvedAttribute("a") :: UnresolvedAttribute("b") :: Nil, basePlan, - numPartitions = newConf.numShufflePartitions)) + numPartitions = conf.numShufflePartitions)) assertEqual(s"$baseSql distribute by a sort by b", Sort(SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, global = false, RepartitionByExpression(UnresolvedAttribute("a") :: Nil, basePlan, - numPartitions = newConf.numShufflePartitions))) + numPartitions = conf.numShufflePartitions))) assertEqual(s"$baseSql cluster by a, b", Sort(SortOrder(UnresolvedAttribute("a"), Ascending) :: SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, global = false, RepartitionByExpression(UnresolvedAttribute("a") :: UnresolvedAttribute("b") :: Nil, basePlan, - numPartitions = newConf.numShufflePartitions))) + numPartitions = conf.numShufflePartitions))) } test("pipeline concatenation") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 5643c58d9f847..750574830381f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -29,13 +29,13 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} +import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} // TODO: merge this with DDLSuite (SPARK-14441) class DDLCommandSuite extends PlanTest { - private lazy val parser = new SparkSqlParser(new SQLConf) + private lazy val parser = new SparkSqlParser private def assertUnsupported(sql: String, containsThesePhrases: Seq[String] = Seq()): Unit = { val e = intercept[ParseException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala 
index d5a946aeaac31..c5e5b70e21335 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.plans.PlanTest -class VariableSubstitutionSuite extends SparkFunSuite { +class VariableSubstitutionSuite extends SparkFunSuite with PlanTest { - private lazy val conf = new SQLConf - private lazy val sub = new VariableSubstitution(conf) + private lazy val sub = new VariableSubstitution test("system property") { System.setProperty("varSubSuite.var", "abcd") @@ -35,26 +34,26 @@ class VariableSubstitutionSuite extends SparkFunSuite { } test("Spark configuration variable") { - conf.setConfString("some-random-string-abcd", "1234abcd") - assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") + withSQLConf("some-random-string-abcd" -> "1234abcd") { + assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") + } } test("multiple substitutes") { val q = "select ${bar} ${foo} ${doo} this is great" - conf.setConfString("bar", "1") - conf.setConfString("foo", "2") - conf.setConfString("doo", "3") - assert(sub.substitute(q) == "select 1 2 3 this is great") + withSQLConf("bar" -> "1", "foo" -> "2", "doo" -> "3") { + assert(sub.substitute(q) == "select 1 2 3 this is great") + } } test("test nested substitutes") { val q = "select ${bar} ${foo} this is great" - conf.setConfString("bar", "1") - conf.setConfString("foo", "${bar}") - assert(sub.substitute(q) == "select 1 1 this is great") + withSQLConf("bar" -> "1", "foo" -> "${bar}") { + assert(sub.substitute(q) == "select 1 1 this is great") + } } } From c8d0aba198c0f593c2b6b656c23b3d0fb7ea98a2 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 5 Jul 2017 16:33:23 -0700 Subject: [PATCH 099/779] [SPARK-21278][PYSPARK] Upgrade to Py4J 0.10.6 ## What changes were proposed in this pull request? This PR aims to bump Py4J in order to fix the following float/double bug. Py4J 0.10.5 fixes this (https://github.com/bartdag/py4j/issues/272) and the latest Py4J is 0.10.6. **BEFORE** ``` >>> df = spark.range(1) >>> df.select(df['id'] + 17.133574204226083).show() +--------------------+ |(id + 17.1335742042)| +--------------------+ | 17.1335742042| +--------------------+ ``` **AFTER** ``` >>> df = spark.range(1) >>> df.select(df['id'] + 17.133574204226083).show() +-------------------------+ |(id + 17.133574204226083)| +-------------------------+ | 17.133574204226083| +-------------------------+ ``` ## How was this patch tested? Manual. Author: Dongjoon Hyun Closes #18546 from dongjoon-hyun/SPARK-21278. 
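The BEFORE/AFTER output above can also be checked programmatically. Below is a minimal sketch of such a check, assuming a local PySpark build that already bundles the upgraded py4j-0.10.6; the session setup, the `shifted` alias, and the use of an assertion are illustrative and not part of this patch:

```python
# Minimal sketch (not part of this patch): verify that a Python double survives
# the Py4J round trip with full precision after the upgrade to 0.10.6.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("py4j-float-check").getOrCreate()

value = 17.133574204226083   # the literal from the BEFORE/AFTER example above
df = spark.range(1)          # single row with id = 0

# With Py4J 0.10.4 the literal was truncated (~17.1335742042) on the JVM side;
# with 0.10.6 the collected result should equal the original Python double.
row = df.select((df["id"] + value).alias("shifted")).collect()[0]
assert row["shifted"] == value, row["shifted"]

spark.stop()
```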
--- LICENSE | 2 +- bin/pyspark | 2 +- bin/pyspark2.cmd | 2 +- core/pom.xml | 2 +- .../apache/spark/api/python/PythonUtils.scala | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- python/README.md | 2 +- python/docs/Makefile | 2 +- python/lib/py4j-0.10.4-src.zip | Bin 74096 -> 0 bytes python/lib/py4j-0.10.6-src.zip | Bin 0 -> 80352 bytes python/setup.py | 2 +- .../org/apache/spark/deploy/yarn/Client.scala | 2 +- .../spark/deploy/yarn/YarnClusterSuite.scala | 2 +- sbin/spark-config.sh | 2 +- 15 files changed, 13 insertions(+), 13 deletions(-) delete mode 100644 python/lib/py4j-0.10.4-src.zip create mode 100644 python/lib/py4j-0.10.6-src.zip diff --git a/LICENSE b/LICENSE index 66a2e8f132953..39fe0dc462385 100644 --- a/LICENSE +++ b/LICENSE @@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt. (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) - (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/) + (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/) (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) (BSD licence) sbt and sbt-launch-lib.bash (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) diff --git a/bin/pyspark b/bin/pyspark index 98387c2ec5b8a..d3b512eeb1209 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -57,7 +57,7 @@ export PYSPARK_PYTHON # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.6-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index f211c0873ad2f..46d4d5c883cfb 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.4-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.6-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/core/pom.xml b/core/pom.xml index 326dde4f274bb..91ee941471495 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -335,7 +335,7 @@ net.sf.py4j py4j - 0.10.4 + 0.10.6 org.apache.spark diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index c4e55b5e89027..92e228a9dd10c 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -32,7 +32,7 @@ private[spark] object PythonUtils { val pythonPath = new ArrayBuffer[String] for (sparkHome <- sys.env.get("SPARK_HOME")) { pythonPath += Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator) - pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.10.4-src.zip").mkString(File.separator) + pythonPath += Seq(sparkHome, "python", "lib", 
"py4j-0.10.6-src.zip").mkString(File.separator) } pythonPath ++= SparkContext.jarOfObject(this) pythonPath.mkString(File.pathSeparator) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 9287bd47cf113..c1325318d52fa 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -156,7 +156,7 @@ parquet-jackson-1.8.2.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.4.jar +py4j-0.10.6.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 9127413ab6c23..ac5abd21807b6 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -157,7 +157,7 @@ parquet-jackson-1.8.2.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar -py4j-0.10.4.jar +py4j-0.10.6.jar pyrolite-4.13.jar scala-compiler-2.11.8.jar scala-library-2.11.8.jar diff --git a/python/README.md b/python/README.md index 0a5c8010b8486..84ec88141cb00 100644 --- a/python/README.md +++ b/python/README.md @@ -29,4 +29,4 @@ The Python packaging for Spark is not intended to replace all of the other use c ## Python Requirements -At its core PySpark depends on Py4J (currently version 0.10.4), but additional sub-packages have their own requirements (including numpy and pandas). \ No newline at end of file +At its core PySpark depends on Py4J (currently version 0.10.6), but additional sub-packages have their own requirements (including numpy and pandas). diff --git a/python/docs/Makefile b/python/docs/Makefile index 5e4cfb8ab6fe3..09898f29950ed 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -7,7 +7,7 @@ SPHINXBUILD ?= sphinx-build PAPER ?= BUILDDIR ?= _build -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.4-src.zip) +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.6-src.zip) # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) diff --git a/python/lib/py4j-0.10.4-src.zip b/python/lib/py4j-0.10.4-src.zip deleted file mode 100644 index 8c3829e328726df4dfad7b848d6daaed03495760..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 74096 zcmY(qV~{R9&@DQ)?b&1Qv2EM7ZQI&o;~CqwZQHi(`+n!1b5Gs;NK#2xex$0qv)1Y; zNP~i*0sT))&s3EBUz7jcf&Vu;c(Pd0EBtR0C?LvzO%f{;IB_}u?IGX*0U`Y#6C*=o zYX^HL7di*eVGZ5NB?ctl+ghj(2W^e$w>;vSJ@$f#yuDzFf7e&r7MV+=%bWhUq_X^r zH#f`P&42VV$1|9dGMLYINZV=CIs65z8arM2I~)(ioa!ao@ltTF#5lo^S4-PblK zy-)^8!w`v0o|QoxZ99dFDMc75zp?k3fo2|d1Z5ZRo8&x)3LBju_h)PcAkPz;!weMI z{)}eE=T9UMfkS~x(jv?~6%`-Wl>*`w0Zcx<@i_P*VX4?uGh!yOxFp444%{297-k?m z73R}T0KhRE(Gd<_T1K7%!Q$a?xK0?PJ!+t3Os~N@G}+=JO!)L>@SVh~N)E#5XjaVA zRRDFHHyv^zAeZ2SfNR`pgV!L@{#q3~20qP;IFg(Lug1yL$V^9<3?>ECLu##>*(a?(5%JZX?j+I~RO3kc(>$>z<^^d3qMZcn#>Yyk>3K3x@ z41k6#Wfu>{KQP9Ch!`waWhjP4OnHeDZz?j5zfmZ*-&VpVG=IBJk}bN_0U->h(1&=Y zp~`^s;&XijeW46KegJUco0?Fqo%t~6*a47k(LUcZ?bpCjm|U>zb{nnFC_C=*zfTp~ z{PT<6x^kEuR+R|~$-&ES&H*Z5d-0<0iUrH`F1V~G9oquImm??3f(8A$X4u&kMt+q_ zM3ZcJW_!RqWM%)WpjL8jM{N$}ufkEtuO{oyk}uXA%2x4V&5?AxV8#2I!e8miKmH@m z?=@RFwUHG-LGB1rUTKaoJ^I&ya=SIlLYGdM^Y4Ah{#Dxc8KFh3uJhJMq&_YnLD@VdMa{Ec{;W{F%n!HV&j7gCSgph=&y;u>)}hhC|R z7tx;^DWaz6-v*>G65LUz(D0J_hWhps8Wk@_6) z9jV8Ps*QfoVZ)cr>*8}*4hem=l%?0QDO~1rkQ+kLzWVKlhV1*h0 z-SccbgI_x%J&EHeXW-iq0Xa{m%l3p)( zg(;{&Kw2i{#Ud&wy`v|b8$9xax-=MFPvZNXggoBAovLZJkK-%pv>f=jQgf3?&T7=+S9>@qnQ zRmjEhp3~G66(z%1Q*ip6#&qJ5z$ur6fMC1>*D0cEnph38HZ}}sekyF0RZ0eSp+3lr zG=`V;jB&z;OBL;T8F0{4cO@D?D$c@8Q8$N9gkA(YLMb-g6MbZo4^$dsg#gubMz7Sj 
[... GIT binary patch data for python/lib/py4j-0.10.4-src.zip omitted ...]
diff --git a/python/lib/py4j-0.10.6-src.zip b/python/lib/py4j-0.10.6-src.zip
new file mode 100644
index 0000000000000000000000000000000000000000..2f8edcc0c0b886669460642aa650a1b642367439
GIT binary patch
literal 80352
[... GIT binary patch data for python/lib/py4j-0.10.6-src.zip omitted ...]
zzks;u+q55ct(g4>p8kby<~YP%=coVxzPbPaDF22hBWnv2TW2Q|M^_Wae`CSF!0uo1 z-@xu#`y%P6HRZRq4`{)gA;nzTHCwqvc|~jGVl{f}!l+g2NTZH@q<}!kx?S`GNk;m;nclK2jw~ zLSc`rMZB#`+SWu@jR>*IxhIv>J77;TLtm|dR+R}ET4PTt10Lz#=z$QCuf$}BB%@tC zwatRA;3~_2F&aaf+Te*<7ur#?_x}v1l{r2iA|ziF@?!8en1 z0+yUUz;*&d6itxsh^nKNH%F=JwVFeo={PWkG99)Ej#hIr`S{v^z7Coe;@DNGtt95y z_)cN9NK%NtQ!>4}8q1?h5+aHZ`(SP9qpoAG zp|pO?h_NlcHeZ^L?@WKo@Xbs#0MhQ~q!?NYyOrcBY0sUy6RXp(QPdlD0?Ed>-U7(u zFB51^`qfQV+$imO-#`mYU4RnMoxcI>XkFPQ%(Bm#W;-?yH)njvkwS zb9KgUYiobmy1eW{%z_!8F=R9wi)_mqCoE8czP@cTC(LyM2ypy#DnqSW@i#M#x|%?EQ7L~}E!QUlDq)X4OqO7?x5 z9sCQHB{GA$z}DKWKFje3nu5Bv1+45-_3_2QJ*$^sIYjTKZZ~4;-}~~q*MQ$GjcOuX zC^TyH{mNsx$Nb4$j6J{;SX>b6lrm~!&7X~;@V+H~ZXsNRS;Ha*t1VXq#P^*s%|wt$ z6jU%>%TLBW&Kt)Yd~#`+FzrrSDP9@Xsh-AgS8)uRU={?oBCedNqePLlNcqXYzi$i7 z%2}`ZKc@EncP;Tu#(PgSNT<9E#W>*dMSoZQ^s-Rwp5Kd-h z-XWRydEzVVYG!ZK+8jO5_s~>wf+FX$_m29Zu+IP3U}LytElYKIfk-2acnT}62p3uw zv+t9RQ|gpS$?JU7w*C>bke}#Rn1cG=88$+ygmnt{ei-`8s9^}}LbWE5gb4F~l-KwB zCTSRa<462Kdc<>S(4K39~X>J1BqRJETL>by{2Hkqof4uyPm) z24-Q)&~AJc56Z?}XSFQG=bP#Vi7e;ErG}p2mu;LG@Dww*L zCwP;<%9KE8qz+f3AyB#6w(TMnJ|Vz?d9)(SU(op?8ead#DMy{;eYsSc;1F#x*lJ!C z2CR~rVu}{{HT-vnv;YQ6^+2Mb)9I^oz?f7b z17Iqfu6OUnRtXUWj{WcV6cbBNHwjTYWK4sh3atk=sFHQg>8KWmWO+&(HL#RK3nFJ6 z3(tj#_Yo-2Fa3HCuSEm&)CgMyfi%WHX0c?aqR$A)zb(S!ERUswrmB=4hb5?~Hkl0b;BN8)z0 zHg^Qs;|E>fZn)76JSB~DVuNgTmUfB#IqMeh z6vxngU2+c`kU4|#I`@W-iepJl3|@sN7P8zX-Pnj;akhHD2ZWwKWKU96v#dWF^cItU z%aS4PI>Gk_?AwvoAcbdU;83eb{q;5{LZ+Yez-lM#BO77aSw>%gSVP*LiSa=?`OhR6 z>Yx32=Jyj2=2ch8f%usJ%=l9_%I0$VyUwJ_TtFHV5AYsaL}fn%T zb7qzq5c42X49xd&DK~U-tqf0iQ@q`hQLK|iRzYzP!`ts*_GvTSFZ zH{BpjK;lBcWc4L{l?r4WXN~BNrG=KdD5fR;-mGYN8(J3Gr zt5B8|oMeIkWsEorBX$M%VcnsBgpR%0G_M0cSuf;tj*0v3B?XpXt zG}z7a+yG4#ehaCMCx%&G9LInwL!GSXQk3jY&)B|SKZEyM4ed6!dF|xK!SDgXrIBo9 zT$|HJa>vLydtmJHo^b?$>XSxd-4@FB}`L(=h^|tk}qSaat#V(=3aGd`WDp zy0f^*mHCrN0Ws>ML5{1X-EdZE7(sRG!86)6_!Cbtp+4iIeshH22(O&+#jA5)w4=V? zyBT#9VR+shhEWFwY1d8Ak#Cqfwy9yiYP(7%>IY9t;gjw){ukOIK1A&rr=9yx zBsm7GZiWRfdF^V7YaB!)d?i{Oem8?C2WirE z-_9scgUE^|FZ6M_n;rS}e#^bD;d4~B3jXFGye&AQSKut`<)c85S2i8nXB11FW%WU! zt|b#aI^_B|26Bohn`1G&kfxc{->1sIgOeH^uiU+YoJ(k>C+#*C->J)Afj^JXH1e#~ zTz~$kh4P+q44JH6tVhl<%&#yU@7t2mi=3FL2%5R@+;yr$s%pz{lWkhmD1D^J&+ww8 z-I6|{nD->^BDl%8SZM`#NI~_}IGeHAy!@LwVmS^6>nGe z3@cH_wsy`JZEuF&Tr7PzHLt*P{jASW1vXE8wMy5yfaF2L3{|XTd8<`hOQ7sfNA<{5k8_!YBj_7H`zuvV;uW7hTrtL_|DYLr2KQc#5WKD=BoA-g zFUPrd_}TR|9w&o8?OW(yB~dn~3p+=Hh~_l$graf_`UqbS;y>9%st6lM|RTj6uJGEp(Lhp}dmhT`Jz+OIkLg+f=*N8J$NAG`k?&t0&Rz^nd zPb|DWycx3q)@fAre`I=m-_s1n?e3(zt=;1Y`w+@PW+T39t&}UUZVfxE>mB{zw;Px4 zy;WQr6QfL#VYgzpi;5??@05ncs5S{rsao)2gM8KG&c0K~g^=9Z@yHYTqS)=T9Xs zvQi3j^>TD0jxz0zmngwhHUj{aoV@J-bSKE2Z)4UL8$>)8-%+zil)oOgez9fGUzBh! 
zB%(jAiHaXO9qZMibWBFJ4rd7?Gf6KY&T%)JR9RI9)RU>;2BEiy8mbh?YGxVJU&nl= zU}A!j$0kk9*4EV4?pI8_nY*}oj%m_?WQW6OC3JU?C0Xq1vgktW6;e<2clxCDd|zJ^ z#0FFM8IhF*3Wx>pY>t=Q34QyCe4QVh?R3Xit_kWO#N?1PSJk6+jBf}dA} zRCKBqRnN8Kg?>HHebxtOauS0t>aYxYoeUMsbpa19Tc0Z)Dr*FelymGA%mF$n#|wwN z=Y)g$@MQJ%^hVR0va^rXnHA+m*ds+*d_C+u-Rf4|UAumr?|g~7z_U@I^^IO!I%E!3 zGC?dBJW8|fCMcdH9X&d^U=MD)4>nz^V%q_(8n&o-)kD6-5^j$93@_nE?d~jm1#gG) z3?J~+8#$P})Wy~PLInnuDs4+!t)<2`43iKPp*L3-%q0nl&dpQD4 zVr)getD`+z5c($5Cs+&HJ$7viXN9o?zdj0fC+%UVAvxq78gZ=aIG1qh@c5g3@9%@6 zMk$G;^VXa-e&3)Hrl=>6~IE<&={1?0W&q7cV3WYMmHlZ?7wYpO$4TK0Us<66 zD!x|x=1%QiBAA$W_}eit8GG$AF-~5(S{+V-abENe)~KXw1K=B<%wv&uz+(qMwDjPz zfWOB5r8Xu}Gx|QiF0hk2SmCY^(#&GUOkV36XK9|xrqcMr1SSV$v*Y{y*0Wx6AAP+| zYQRKvJ{`IS1)7P6bL8uygFE_r=C;2Wlw-rkr-}^s<|WVDf>FdlmDCXkfHi^`{mIt- z{FdgAav&!}m4{g34Hx|2u`q!gKY=Vik;FDBCq(=$e(Y2bUq*<4Z1xoI1@_?%CX^S# zxr{KN-0kY2YxQ$c~tyfThY__XGrb&DzM|)YA(yI%w+a+YdZ#( z@9T%a?XiH8r~6tSsfJ5y=+MY_7{ptNMXx`c!s@fSz99Rd1+-zb+gYoQ-13DCSr0(swB40%H{K6nKqsta;wwM zB0BVw5CYXJE6ltzcbHf~h9aXS@;qy2AJgf!(k8&Uo{f?zQm3=AILmZECaOgR_A&T% zLUzUH&$s64jjze4Ekn_{L{5SIZY+|niknyt-37dQ&~;kVCA+YOoBg)h*yUCMcf#%R z=cv7gw^s(W_ciZ ziELhk>M-5fV}wPqe>P^Ddf8nqo-VGmwtJh(##=-2wlrO#6Yg&Q zXQFv(Dt~n=+|(;lEmvhdpk1FN^ZZ{dvov^Z*ez-0uNQx?jM^68nb(v)MP^@Mz<=TY zU)Go^*?s~87yuv<@*f59|FFh(HueVpM?3r<_~I7Tnb;);gx)hs(H*HyD#S~oU@A}b zf)HRNpnze(8Y1!x>z=I5ifij7^dnGr#!*+_EL-&W^MZ!H_tUYXIR`p|6ijqC>T>Cy zLm7swnLsy3Ev6x4U3(}0ZlsC>Mf35JK*_soBQQwYbz*V?fhIAOyLWvy#eSyBfC!k{ zLQbea)G7!S>UO&NQ25I=xrT(1Rg;O7Z9$x~lnRtl#ycCzlm^IhqG(EXD%Eq)df}-; zCJZl81QuyJ=ZFf6vX6Wp=_ZBoznPoydpHOnapf$uB&S4a^{sFV2!;KBU;^-~PH3yw z5a`hFJbR)1ed&Ze++FZVX?*Tn-e~p`c)Ue`?EwXE)cr$71Lkt4p&rZDleX#Y4rsOC z0WzMM+gA)$lMVc!Dx82C$Foic;;UvKSBAy(Z3aCh>xj)P(jz0dig#Ch=T+=%LW)!C zzKx8<(?xAi+)f9Kg)LulFs{vUj`ZUxN8qHei*lZ7u~*y|^$*40|N3H|5E9Gz*V=V5 z277Ko)JoWgMxY2GktT$~u9YPHu3-MYl9+POo~Yh?x5jdgTQCxLFc>hoq)`#SrCU>T z3CMclF?zer-eH0IRn!ZDZ(B=w%2o3qbz}_fbPw_P+0)EccJ6Nm=QRIHZuQjGI8yfc zM4w#c60a!Y%Z0ZniPg=*Wl|{OS0wzzk#gj3v^WcLmjIYGX7L$uo_MqUlIL(9>QOy+ zK>RATPvBdw4nXrC)YUyVOY|hqvA#wC#EiP-jB3%|f@$Je`mxXVdS%$9n}$YO+fr_h zadYV20oZ|_p}*tlpoeT<-5la8h;Q~UqDTQ_4$*Glzeq2{NCP2m3)>|6 zb{(!`9dazIA49j3u-VL=lssbJ=p);~<`wtDv0lzAiL&4mP?=!Pa*JneH`cqd`VJ4p zC_WW1K`AT>a`LN@R@$#Kc7l{Idn~K?YCFyUNy78KbO5@-cM16rxWoQWef`Tcp!87` zy!20fEg}E_ApX0)Of75;tSvnMBgy;^u7PW{ov1BQ1iv*sMs0jcbgg32c_NN;ohbnV z3&I}WltV2kVk@UAorbQ+qu{UGobPzjjfb6%wJ^a#axa#=zV{iJ53tldT0%9@v@DpP z?jDVwW0HxH5s?Q`)EfSJ)FyfT2~sS5`yo=~FJjAaP-ZZeOi_uUwPH;wnP2v8_2pxP zbQ*E!jAA~C4NS5LdaR5QpyXh7>7h92w}X*a`a}|=nwsR<{Y(q3f__i%zD8Kd(HOak z<;u{NN^6@chZZ1Rc1bOB(lR7_QVvklK^p&>7^IpJm3i^Nk{XJ7>K112PtB9pdy^)w zce)vs9r=!X){ZKfM3RC6$$Q+cOQIC;3n{d-eiX@NI!MM9UfB;Tl#D ze-*E8{iwgYT?^`kq8Z4!Il#coxrUN19xVZrsaui7e~=Al9zH!X-SOK{MPLHj<5XIK zT2Unjv!m0SO;NI+8~9U}IUrIz%y>m-U3Q^LKiA1z9PWZo)-AQu12XNXppo{HRz}K0*);O zph^{MDEO(Q4C(>VF7bq9CbzLrOkms^mjX9JQmva{58$@p@&nb-`rk%mnSu^^UPFuA}Jk2SHU+sCt1nZO;JHqxEK^rXX)% zVSfIzB1Ur{n3GP>ppnTyD?JO|kl6C*$ zLXb&c`d?T%#wEQV{ES5PLAKLv0*_l%uXu5T-6AlRv4TZ=S7d+IoIW<%0?Y}{(azb7 zJWfJ~)gT0#8Y}AwbR`J#Ag47e%uF*Lv1>xXBTio1?-UsV(FjN;d3gwNq08wDPq^8T zSc473#czC0?W;&Em1uQqxq}B;=wrRE4BZAN?>YuWDWph7?qCROlG=_S7c|)Zy@|I3 zE+ZCmxQK0mWr>u!stw+sc|IdN*G(4lD-R0!V*laS3FKasDe7kB)7E*|@N6%9XU z^mJ90I{Cy?h6E#%-fuD=8?<5GQlMfWi$6*KL*=_rc6;UH^|yBf_s@gdi%-_@&l9TG zaUiZ&=b*HHLT)IJ{fR~9?|<{|+%}|ns=xvO0Eq$sVElV^urzQr&@-~Lwl**@PwBM)bb!17sU4vUPClTMJd_zr(4rZClsb zCQB+r>U76UQYPG?w@ozQhfkWvpbYhEUztOh?J4Bhcqo|p{(Z&Z`n0aa4#H3Q>5z!W1)wA)Csb}b5HAFY2Fl2B(#IE zXk=@Oan0JlPYrw%Xi~aw#08;uc7U9vhiGqS+7E@}17Hm}bx)n4cdt-e#5RB z$2@GONPYgX1DM!=A#1pTZ}R&I59H@JL{mYItp|XR{`!DslA|*FWCU^1aZecY+9F+R 
za03z3MIHM1+VA-K1tb!#z|?@9&BD~x%_^KGe}bv1F=PXIVWO-ng88R?^ntF2Y)^?u z1ylz?R0T2z3?=G>YqLb~f$>WECj-*smfFJ$`6G>2jX>uXiWxZ+K6x@p8pLOw?v*x+ z`&ipdLl`Hsfxun5;2~ukpd|pOX}k)?#$T#fI-B*0cP=~zi_A| z7(B&mB4T5bQGKu|sxwL2lK@O70}0`{e&bH?Iegcw>|9tGtasyb<8qH>_GR4XT!h)9xO)w-enyzW0DaR9|N5DI!O z&sz~X5y`SPkLV=S`QYNBw^d;!10@$ovQ&x-O}&?b>JmjHcSDrM=y7KQQb20+Ket7( zjla{D)`Xr}f!p~Dk1Z)A_Ul%(03^>8ZzcsSMWzQrj5-C#R|N`xeq#dz%Ln94_6R?^ z6_j9iP6`0uxVi?*8SZrZN|^fZ&~;Tey<%Jw#G)96=@8j_>JUbVvkQvpjs7ABj4`23 zfVD(WqDTGJTxRJ-F8}BU{3C?$0^9#4G^DC_$C09yJ{|ufR(+L9eIrjC2^Fb+x&bPUHE(%vVSoybxsasJ|7Ai*`YCY7))oTGFazTEg3%fB7!Lr)D*RVzhKWW zsDo@C`#tSzOav%TG+AF|0^r61Bv2++6o3khwjr6= zu+t_A$(zGg2-56U)f~@v*PW2*Y58RN&dO9yhaRTSzq1jJTW&5t5@SI_&b>EYMwmZY zk(Grx;IVvcJ166XQf;x>w_O99z11AhRxaRLRck!9K2S^G!Md3Md!Vwtg_Mg}54w*Pr3p3}y#nCS12~LxFY= z4I7qvXcO7^@OpI^d}5! zw4!L#`vEczEn6u>JyACk(aGev`CJl|A)4oax*|C+Kwm_@3xq&+HCm2hO6&BVi0;se zqCLNS5>)M!shC~kCjw?pseaUxk|1Sm_2XmyoYsFNah7bcqTPiVoA5^%Og2iBza+o(ceM%ENcj z)Kn2);Ssx;nb=vM$eEZK@XME+oK@P5oVft!Zv>}2q8w(r`cvr#r;>k zYar73#j=a9Z8Bc}yl6;N(NsdoaYZ*`2Wv2Xnh&Ku1TD?{?^=I_r#CW1hRsNJ-me%_ z1i2{}$kh=11&1a35xksSEO?xeF9;N>YkD3P&$s%{F~Q*%SErd!4saA#YP|poj=!dg z+VRvRScT09M9{~FntlS6wJ>N4#nfFsE8ds>A@LJqX{G4j0|%X2aQNL$3!!{OYj`aG zWiVsY76AL99E4C5wRp!@^R_^ctyHoW8T=Ul&ixl$kvrHqib9HNQ$olJ^n-!QHqmhu z9+w}Gu2yD+2f7m-ed{m!dSPD-BZwpnze=h{WpEq;N*Zf*8g)L9x`GmbAmel5HX1C4(7tciiCf} z>&UdHzv82O;fEjjt?j7VU$NOx^p+@Z7Y{%%9m-}filb^48B{`|LLJI&8OZsd9r_W0 zO(V*fYfK$uwB`D7%QgPixX0p;mKMt}{0XD@ZVS`}@BoHU$Tu6>U74&jY!mxw!H1)Z zZtVP=xY>N11Ux+Z_EgJCcS?_t+EO`}x>N6M4zC>H#5b1`I9{M!`Rym`#W%zZtqou* zY4pR-w!iyhuY_F@2FZ{*V$9v|y0tO}QiaSea<3n+%qkpkE%_L?F{+7!@MIW4E&A5k zVcIbhCK}ju|0)Rc4)Axy}V+TtX)i;s(m4nwy$rqAxm5FA|4DXKQqtHMVXm;H$wgtbtM`w}w4es~`EAXwq_1mGEWPABXYRpB33mwlG#kd#t+< zdG-FnBF7>lx_s@`yOb-X82q^8`jIS^D8$?Miyq`e2SgQEAf{pvfiTk-c6t5zJP zu-Y$E1u46oUs^b)YeaN;RAQ3?a8oUfzr8N$sn6zWhdRJL#Q`r`C+yeQSt0BymRf0D zZySZjE5_K9y4TG!CG@_n)PDDmcSz~&_ysVda!mmiWP4KwqmFMsVlQRnLknmnj0qdS zpSNevdvjO#XWc(mG?6p#!9I{f?8``R8+2V`a5)D_s;-Nnk9=|+>tj|#)Q zM&b2!8rk?9$|5&I%Ltwc2frt$KFA#0PPUASx;2yA8T}_!=B{9NlGb%uQaK7R4)S3| zc_kD32`NNqnoKM%h8}5Z)-h=@q(X7AXR2Z=HlXd8>oU&^%9^w0NVb!nW#U|rcCs|S z3s$UC&}hH9Y-;W1@O^lwL4SGUshhfAOqY4wY7erebY|Y%-Ob2D?UGIux%F*p0GVS#1Zt;^8KweJAqgtvpq*K6X5<)C(?Jql zE7Lr*aqqXCIeKqh>jVB|vRh-}qt;E?D|zK_b!RlCh%|cKwT|zw8$()rSngJ}VxvtP z#+<+RG{xtm=bo>P>Nv{u5HG3m0o3Ry=%YF;{o+r?l36*?-qjf~=BFLwNqrFgh|NT(*6uKlx_c%~+G3 z3%&*~wd;2M(xyBIyo|l$<`0W4D;K;s6P7k!%&zvXm-FWt9b3hn-+xG)e_3mJYA4Ce z! 
zD$#2jTmYU@p|5=n$e{|JMjUczXuAtDjoDLq*mf5arEniAPA66Gb^NCNmA$Wn5_ zsnS=iEafNB_>NP|rq`6%b#i6F#)#{w8iWY4ue=5JEVX}+fa=zO&FDcPkR7=m(M5b8OY zfK^A&nI1HqbkrG>0q0@o%ppwfP4e1ibRfJ;dChGTA+!io%&W zf!Sq=$4REv393fQ01br3gIQdmkDAeoFGD$~JiFXyB?oZ!|5?!4M9&peF-;>D{%F}|=u)yiOsV&tPrVmH_*vI@v1 zg3x!56aSFhigdY&HmJU96J5=)VddU|9GtOc4a&JN^bG}^l`$8s5QefN6#jt*_Pdpv z;GdjKQ(n*;!DQDbSiZo%@YDMI+vCn1ixyq&KJSgbNHFr>l`!48*WZ`5Rfpe`h35+f zPEOw5OL(xm_V1?+zu?ciy`@$kFAoRSV6{2lx1Z13ouRz2I{Mk7sj8?v-xn=zH({@y zpA!bG-0Wxi+Jd1Ag}UF5!>4w?kfU=|+vMGai;JkD-MEE(zh|_!YPT>tzGgZ<-ybrp zFLy`GXF6HA&z^&ckw1(*&mvh{@O8BcyNW6UpD&sz*R|$#_j!>&l&Y2AfXz!(Du79G zp{b4Zk&{$YPWEA8o3{G7{TugY`Dufbx)d-Fn-}0b86-x3IxfNi5s_mt*30Lg`WJem zkn`&&AB|;V{a74n#fS-`-l4QTRzz-b%l2oMB9Jzu(*ZqOSrM>-0QXHJt$IluOEXRf z`+lum)TL1&%bYxlfQ=4!YdXfS_SpxiH4-FZ4t>UUC581j{0G8#pQ6dT=;Qs zk^6ov3Lol{KCaG|Wscb}-P6OS36nmd1bSUjJ4;^AAQFMemGgqDkQ zz>h-~x2;V708BJCezX=6qOxr9DU_p)ItkK%gh7s)17OTVS*M7xyD_+9o+H5Ld{{vC zs|k7G0LuF@p`vM}8Wt9?k|6-Bp)*u_NU;d07PD#(hxeldddY{1(w|s1@|&}9c(^$7 z9xDTT0w1RwJNF|we;{DRVxZ3eP={53r2nFaCt9qy<_DYjPkgk%9#zO$BF!wm0J{3i zXUs)?F}J8lRJ6VjMf8izApM~)C<>Y|xUKYK`73tz?G_I2=}%LrriQ+jmZu>;>YbUo z>L#IXuTlvC0AJn+6Rm;h?p6ubtp#OC=`59Sg5z$Q3^7pF#R2+B*&<8-H;Asn!X8oQ zxenY1gs1dYabnrnw5mAr5yU36ItbGrGnq00n;LK`?vaK>t$Sly`sMhi`61Q z*9TB<5~w*$0aKrj6itE+Ebo!QSb_%+M(_&9(#O;BfZG+61TIB_v=gWX%LnGzAj}4V zD&Qms2W#DT+t1!jA24~fkIf-2zbVD!j~K<*)sNW|do1Z4yHY*> zU6FoAh^A@V-%#nx$E&5eYro}Cc)(#AlNOQGS+uk})T7WsM#oRA)aSZ6UT9jKH~m>l zO;ff%0#FBP{8+a{Rk%fBRr=(RJV;}jKm;GMXq3_l{NL`Clof~8jBoB;(Y8h|YbzJT z{3(O9$;h6gfaKs>DNfyoBl!vUn$g;?h?WqZ2#ERg@xA;GQuzhVVqD`3 zhas51{QBX{d+|W}is2aQBZu9p5A|{^_}x_S+x5}`8qmrd@LeS4?Ujcl+#$lO_l#}?if zH1xi@yfAaJ4d}37YxP(*IfNX6N0rXEe#ACI^$mIS2(2lt3V#Ze!8mJHP_Ju}l*DDP6Hax(>OAHlcp5mMRjKRJnrQ(0%@)9j7c0 z4pI#ei7h9}19J5sRQ#8qM)VrQL2w-`$T4fKI;F;5!ohd_^pipuP2=GU#Qu>5PzBvr z*`U{`>lx;xAmzy|UAI0B`LNiN0WJlyO1b(xL6qce*a8Wn_J-m#vN!3M+WSXN4l6tW z_sIxNM1(OzulZS^hRDD87$aJLKy1ch?Jx)`diw`Q<3rOv)$4Ow8-OOT*5R0x*DTGV zCVLauCPQdY8tP>bTM%)_l2jD@0dG;M6dIMEbM^FFw1?URXAb51`@f6$kUX-P+rtQi zQ@h)V4lo*RE7NtL&?X$g3LN1Html6@gNLs{xr5-(qs${!$-2 zkKM>6y2Dr-q&_YsQMS+VT(e1;uGUS5yl-L)E*MtDEEZow)=YVipgFw05a6NpDO~#T zK=#E{;yzBUJGmGZLO6(ZfkIRF@enlQRS`_0Q?5tUs>5jlS-) zLCYLd<@M43+p6H(HNS351NF`T>lUAB0R#>}2Moz%iRVjh6`(X29A(tq%wKyMY!06;3J0DWe!gm5~4?Wt*V;Huf0Ne_Um26YPJRTkMY92ywM zKjp^F44rbke`KnU)u|dO$I3C<;@FP?hR6`VKQf$x#DZxf(eFbMzrk`;>TvO32faj1 zrBfn!0^Kq^BfW(72p1CpLvyy`v45AGYqCu-)K;KsRyl8M(jaP9@STtQA)7jyVMG^# z!8^0aQ>`px!f3)0>VYE7?YzdydI;Vo35y9S<;$joVK*`kI)}K*NwEZQkBZJ4-m_Vs z3Uy7e7TjzbHqxvof%|ISRY3Wo3SO?Efx>~w3@EB9u9Hq^ANgtamURx&a+I$t9;VztpKu; zWTZ(nlXzN#!p=z5S(*|m#LHmM?L;?&i`I|UygZK`#jmE!DNEl?chq>`Lck`oLVH6I zdI&U?;FkXR5E+_c^QoksE9j^^>=A<9r1)j1Q_?(}IN9v}2GBtOE{GOof>ahM?Im>W z>;?t9dsG!7YWGDBWrKs4)lXNzZbY7eH6o|0S`VbUZ~G(=w79BdE!QS$(*atenHC>0 zj>O|N1X2+k?HZ0sQOl_Wxop;15vmAMy3YH$F zfqIE-Wu+IPB_ib1&&%oQg~HRvVbzZ*Rw=NNL+++Q$VV9)R4F~MMjbkeS}ZC-ZYS*q zKwvnI8LZi-joj=$PA}>11zEjpaxcAp3lc9F5bh2;YYdE1n3{v!-|3d%oTEQb3-*w- z8thqu?orsqg77V%ObvBH_cmg;v3+XsmWo$NrTSrVfg|IBQ)q^JAjYv_H}O#bj~e+2 zKs0fRHl{3SU~?=-H+VLMfj>@kBrfFXR%{P3X>eK5phZS9;(cn;%UUjSW;6g z$JAmKj}vN>2kO;0OTDwvg^J-(@0D+>GofIaQ96fHT`_ZquO7qhF*2ZXgjrRjWKu&a zv^>r0A~L8-wYSX@5cg)o)wlO0;6Y*(@2)}j9PN+?|8Svn3>)_vUetn@)4?|(tPM1+ zhsV4XJl{dvn4M~8>PWlADm$~(L`B|lqe5l2%(6QZoby$X!h`@`z>T=N&Q@u)`EoU3 zJvc=D6XW5N)7-yxoP2B0o2N8?N+|EIO=N~``yCMalE9K%ZUx&YKVPLAy?%TWk($}Vc;wp+6B(R>0rYxcCvMl*dZJc<{z0^|M`q5=bmAqY*z&q78miz z2j#f&x{8m`P*#p=tM(23v`W~49KdN%lsSpeDsQNnAaI+8D}g)G%A~h>Ra5r4Hz)Bs%u)l=T$qjMf4 z2Irx???)idJ|`>m=q=@qBKT$yEdCHn5e=a6d8c;FBI1MA_EZa`#>tdJJ?A7{L60J= 
zle&uAm8yyul892L0t&+|w3v#KYd#YC7_{O%T2mbPr&&ySM%PMatAx)1jpVAzn47~H z1^`{sGeIU}Z%Dw8$u@RB-FS@L>=__k%`(Y94dM56{BuV-d!ex&3rcAx%U{EK&F#pS zC;ijo?VFMO>Q$fiB6n^@IvK)3=lvaTS88fR+mU1ngNwFQP2IwC4N*BaOM!1v%{!Z4+}pSg9$@_8x*lJK69Jmp8i21%c%z5bNmt*@>|v*R=C0}M^8J>PJGDu!*9ROY{*Aoa z^)6)i&|`UPox!JL`Mj4*7|ni3A4H4Axt|zV9Ss?5#Y_!XThm|LQ4z~kS^1Mu)B1*j z+x1k$9RK)uy9}wi1#V1V{0xWdTwUXF!?--rVqVOGxG{2BF5wjcE;+C}89@W=X-8N0hc}pX0n^dx& zWJRIMJWHpv11g*BZ*xw*0nuk{t__n_=BJGKFWyUb+WI@bH92cZXGgh-@fleDqYEgD zBD+J4;J&zbW69Hp>3Se8TwgrKF(T@tkqIBo1!%|IC{8^R$9&^iR+U=jw3ETMjG@F#K4ij>ouB>-80i5@aIfWgNd-A zzHspRdKg_Dzq|UnsiWISsLz_Zcg-Xwe&`_p70B)PRt6W*>B;%4{uRMC+S%LZ1P4)< zxEc)UfHer9VOv&A;;~zcW=G7sZYlM_T|8l5DPs`&IK&LMU{D%7OT>qB7IxzVE10ZV z1UpiL9=am#E;1Pd1E{ArYrXS8m_h<|bNvJ9FXej7dLxyYL^G~j*^;3ip5`&)411Tp zNGU>ji12mOOQI~zW-|zCPxW`!H9Xkol;ysAGWXt*;W=@DjQ>PHh*J@azl-DVx@PpQ z9kyd;eX>xK8}E{-o%=3nb@K*I4?E~BS-XPitb&}bc@icnzEyZN!2EduDzn7~((6rA zo-`m|3M%7PI&?`f&j73mpgbO(tBNYa7R@x7mF6?n5=KfbGof-PfFp5uNW7 z=w8OWP=>~t52v03eP+)+;eKV6>7(eDTHP%CSvPl=_QU=C{ii0i!|+)uYbG^6;=Q$m zK>fGDvzp`3Zw#*+u8!||8RYOOrxPcC@xY6Y)b!UG5^wx%+}Ete*5e$fhhp4rH=&#L zTtR!t2S?0m+BRuz3!Zegfk)|rWu|AJpyh!zFPjDYaMLEu{Q=Zef%oDk0$7pT- zn4h9;;!yP0I`lPGeK&=mW4g7{Tjz@{1)gE5p|6V6t3$CUK;)c2t5#Xaj70{4&ehbZWuXAI{eB zkp5tX_xlA-1RjE+G8`q4@KlG%A)hiN7VsF4tiU5dG?`0`j>HE3r5HTAY7%pZE*UqZ z3L(?+Ghvg}clpQ)e>ul<)Q{T@IAIOKJFP-xM_RKnN?c2@?eRN3Dxlz!B8S!iKd|SV z*lzHQ5Kz}AmMdLp(2UpfKr_xRIyLU~E+HV zQ6z_JcJbm(zjtpn5`Tb$us(jU?Wz?1Y*bsd!FImb(An2U+k5su$hH3FdsviZHx&#i z__tDP@APOec#u|M(EoRu)O9&kv^%uk8?TWuU(rCo^J<%$%=oiWAT~YRX<#}bO1aG~ z-(r(q2=Gpen4X6;0j7)&?#{Nr{`}TT(#TWl(X>7r_k`A3RCOQgvDJiC{rk7qaJ2X- z>O&VAgdZVAEkp{dAfkc-MgQI2w>GzNBnf`^ujqj`!hjcor3vrF26yOPnxbWnMN&&r z_Bb>Opb0d|wm<+z1EP68|M#nWRb{;zASKzJ-9gM)BvF<1%F4=jBC&dp@I-T>U75bt zRDrx?2WTJ>BE=~MQn^y_V2_DU?M%%fP$5WHkDQoZLiW#1!E03 z4S=~|TX#XXtDwD`vHf#oc}gxl==e~vm!aFdY@|@2G2&=^`XnXd=!leTUpl% zM}{K-JH@1$8qNXG7zVMTuM?rbsmH$YqRE5U0(9|4ebNFSZ_ry%c*@(^{FVeAj4hG; zeMbedE?^d==Q-RH7QI=yW3xomH#gO!E|J-0x`pY0#^$2aV{RSK`)vFur?GRC439T57B?| zyrG6*)x*?g=8r@0?i~JP1H{@Ohwh5oK&FI@3bK8_9o!D;nL77`d^u5P=vKkUh{0<+ z>AfD<5V#Nm&UPCJVVGP9=yI|R6cwxI04&!t?5lcV9N-YMVXiO{8T{B)=wUqq`yk{y z6a!fmBTS4a;RD&U*In=N!6~{+7&_8zzJ(G{AkK(U+#kjagaIBKFDrTQiQ0WQt((>6 z`x?yto)?x~O5q=k*4`Za&3#mSFHX5*SS4JqmDrmb3K42-2&oxa5kizi186^jEzQW} ztl_#D#7F7P#?03H-8PTC@{^rkziCBIyDI}oX4lQobK5A4@X5A9k3@~g&bBj4Ow?aU z(o|*fmtQ!S0H{0HmQaIn-!I{YOWGBANa(;YEr1ISrZ36X3>rJs=FlrUpIz2h;L%VP zBW39xwcvGh9DPX=wHsvJ?4zgfuFkgEFO**f=?gq~_$_VlJgpHr^V>(?8QDUXN2;e*Lg%r`s;bKvbxI= z#q^SAJXCMcfpiU<&rZ9Bxm(qj6xO^V?9eN(r?P+|^lu zScP%F^ZDELg4~80jMlxgS~vn(qbV6wgctr63FB(LlzVv85_!uYEikhNb5!*h4Hhu0 z3WkKwdI2{J$Ga=8YqX<~CbJPGp>kzH+tpjwCYo7OOfh)((i}9(VCh}OTNyBkP0GHn)~IOvj)d3|jW<-%!$rAFH|3-XG6&$4U^g9QjW;v%Wau#F z3a*Zgns4e!#d$!zk(A?0*K=;PiU+hbYcM@P5WWk+Q^EH_y0B0Wo#M^%t&nEkUlva= zZn)uCZVZtEPGyX4IOXL^Y5w5j=(JehS95bU{FK1ferF>l>_P^~w-Y%dd6;(Pi3+$i z3M;bVN3NeG@X*!%l4wM*ofR3j!3yB=Zj8@h{Snz>&BA=qIi2!Zam09FRC0CMK5 zP%=}7I0Vm!%o3VfN=qVk!1Xa6@g9L zgJ<^a3TB9}V;bBNoZn}4bDd(aG#hak@Or_LK)%o$>R%oXDYb6zo+Gto7Vq=M0~GgW zf7*EB2|xIssEt5@gtIKk)YUO0m`m?UZ~bNl;f#VuGNz5Ojo(f;8yLi}Ka^DfyZUdkq#fdoP%`rTr94QWPl7Z{rD2|pP-J%4 z0`Ster|{-?<5XZCn5mXc-NLX${MyH2UhHKw-`bxuO*CQA&4(#M@*(*Ypo-o-0LQG& zuUn{56eE?3BiAXEZP*T+$zU)DJD)r|DNfN^=?bb8U~Gc)0c6Ichav_UfWiS<9`0iI zKIya_3WPv{{MlaPZJ4%a>zkW9tghsRBg=5&U<=j(Dg*{}iqLJZ*UDumizmxVddLh!fF zi*Pq$i&|IBCVTj+-pEp{DcALEwfp#Kx`!TE+eNz2tGDH5F3WL+#U(tK646Q|oJsVC z42D9xr^m|R<4oM<5uN)R3@4j9QCrz9?-)&s~g3OYceHN6v!fnymw)3+F~ zOBWIh+xWOCFG<&;VF9}%nyLBh@ub51Fbr=QbOm0< z4eVfDav&$=I<&mh;NzGj+KmulICMf9jI9o~lNF-35cA?rJ1qGitFHudamsb#cgV?j 
zCml0g;i7gd$hTFsP+1^FY1P1P~m6A(%1rq=49F_WJZCE|X;9(iL$yjRTGBY05!d#NeqDELU z9IR?*ypdybWS?87DSzTt1lND7dpU^qyB8ykj!^f$2v`A^V^d*LNJ^3jB~;Q?I}PHZ zaOplO^<&MRSi+Wa?Uv!0t0Qmx(!$iX@#Hvg*D3pmjc;R6m zWt=I2)+!w4O@7 zcjF@NUYchMecHL=fE4gfDU<>GLN^GHvv>M-mpnO46uAn#s6sshBoJd+ew2JwY!wgO zVwJbrA`M4taS-`L`5Sz9RCHn4+=B2#O(w#a9&OXrnsR?96dn}6(9CV{qK6eDI(%$J zhyM}ir3&HJq%#|vrsl`E8NHOYkQzbm=Qx`o?o9Hjh7V`$(Y5Kj0<#y78tDp&!UZ5h zJtKU0%Ti283mXvgj!LsU|A5UodT6h_`0ahD*nL09-wbmKZvJK?1hHy39;GD z!q(9V)6S4BaERoti~sF=cY85Qore_Yg!e3w8iexQ+f3T+j~}l`5H%LE>`TY58~wA* z;mb6wKUi8|+q{6*8ME+udU|r|=udD@+L(F1DlI%Q1IyStqK;}X?|3EnCw0i7W>Ly- z%}5;%2#U(oU2na^lVc-qA3t*=IwyCZ|KP|q;8mR#n_|*37In%Hj43x+M~@j&$Xmne z=_QMBX`U+c*j1O8pb|0!-|W?uV>?`)orqCZL+(+%Cr|MYhf^AENO&RUr)gA_O+9xH zC$=`?(+d`AE zoq#JJFVG-I(A$M!__W*YA~TP;0a^D@#W_esUujOF7xNpol$LVZwAy8qXuhK&dc{!* z>HDieTL4Zy(#L%7dxt-Mj_{7G*y-6mD9ZI{pu+>@6>c!1@7V(^G2r zC4h%-l^(k9z(R6W^S?k%OGswn&_7*|!MS+i_ftXYOHFngOQLr>2r+9F!GBwI{4GQq-d5#Xb0r|{V!{yU z%l_rf#iT55m;Ep{cvtO)zu_&>-i0TowzrL9hP93Wm2wHS6a2jig|7=oP-j;*Hw&ND z$i|P4;C76XS3j+&V;pAK-Ry|Tlw>yxyQQVOYb0s(3$iJkY@Ju-)vEPrsa0V;6ScuD%`<*@nVk_c2YoDhp_ zra*PXZSEwGiZk>UupWaK)g{D@a*jngKu)vB2!!RYXki*w7{{NQK@`nNAln)Q5b_l8 zx6&u4;_x1*Pj*UV4eTNMWTiybpj!3G_q7fMW+9n@I|)%Jx?R1Yq;LW&qI(x9zQ4MiOmQ_kNO(&h72ji~S0rO7ykBw=K28r?OBz?oC;`@W^O)U%19T%5cN{QM z?4zfi_wIG=o45J?i+U-0rMF;Uc#~KWr|nRzSZG(iW!QhyScbZ~aI_xvEC z^t4l_A6`G*c@rSTH)x=4IEd4?$xtnalBz)%?u3XX_=nCY+iU=fM`K<_#U-dPS2&YF zc74lw(d~i4PNziOaPYFFc$=MD5I9xB90NrtD+y`D4}!IHlphIiE$}4fo*wKU9DaA; zypUBTi`WHOk`p?Y;`XoKy9oD%>jVR&iY<_DWG$ck6eUJXX%V>fPtnY78%` zB=c2JFw?6`fNe+?5+W{tB5otlO7r6JB3c3p{I%vYL}n17&raHs|1q!$dQlTO)c`_Y=0qw%p0y2%#r{lK0{B2#jc~-(tkDMG&$4 ziRK|C6 zE-Kx-IZY|itY`t#tUwaYjh$X;u*@(%$Q9BEM)AbF;Jp7SH0O5ueWYeM4@htADUO1G zEmQ~@re8RcKm+**(ktVZAV)^23IRGiD!`keYa911nt8z$!P1rz1I|`ZGjp@C`T(Ss zdU0soY>L(GyjZN43ov|#b9&m7gD$#oc~E!T{AUkS!9dm=tRGmU@U zwxR;?FQGNvGiO&{tXDMRC(*f)jmv*MB7H_Te4-NSgf2LE!*_j;dGX~>DhE$oQwRXY z5f~YDMf8vX!ODQcEl4m^{stOKVD2m@r0R!Gr!Gms?R?q5A(20qZnz@aCk+!=z4RJ)u6rNYH^baR?z~x$XiZ;*Bvd&yH!_EB;@6-dcC!G zcpMA^L-B3wknsA+n=nd`j0T)g)>Q?APppgo*w-cIon(UK?CG&`UdFMXtVDpEvo8{m zBSccLVfcX$8bkAn*gyugu@rOVlxLr~nUTlBzyoU?+t7CbLu8@eHmTrH0Hi|OP@HVJw295k z=M}lm^*wOx0HJy+t4)nYR4Q6Mr%JH`%+?GLagm%v{GhGTkMC|qUKZg}WbS@Z4Xy@O ztuqpLThn9kqnX}dTF$NpbhtrPv4jHk@~7D*FAFSvxeG1dORK0o8#XGzV%LUYJF(?J^pWdTeIeFRA2bU#?yuiq;G4S=K{uz$` zx?F})XGk_JHHca3ZM;y2|5M#ihgzZY^N)MgRLgISu)Mck&7p$xv`Xfh!GFOBZ|(^M@L$iKjMl7o1kg~q=3P{ z(a*Mr`de2`+VIx~tp|KoBo97iApyGC@&>zlsoYL*jOR#YArKYD7L!PaNg1f@`Y$Tc zpDz`@f7$Vr#=H|bwK0F~DUN}flG>Ca|9KRpLWaM~UFMAmOK0RXz;?htv<~<|%3pK6 z-rWrANtjsb!)SsHsuB1$)g!mCIy0_)V2H@@@YyGsw)DeB^&!*j*J=7ZFi8340kXA` z;_?8K%7}FH*eB}4NbOsA#zmPb&Rq%`Ow1uyNGBx(j)YLc6WRtMw0&T;!K>qMk0C$B zCcS}7E3F#v5a&139;l-Tt+%WtFk|e0yZ8Jci7%Q8^77h+R9c^AYFk3kWd|^%!qUPX$sF+mU=W=&KPVotfWC%0w^AQjn_8 zt5@ljGjgv4>UH@I5#N4Szap?%cDZ;u*6k)=jGoTPB{xsKAJ&96{Rz`_`V(zQ zn#kYJtS1974fd`*B&(Qc*)2uxk$kI&1;kW0$E~_sLUC|{D%?b&it0Vp&W+5AI;&|YO3cC3Y2&#tRwO|dZn9H&|# zo7Q1*i(RnnyG@-4=n8W79P)`=PODTQ!AnedeYKt%whR-X2p^cCi@fo>YX5>VJNURt z>G>gukYr5TExLqbuix}GzQy-7j=5H_hldB>Z|BC7rSg&R-L|yXPu}!?2WOuTC=0h6 z(=Z>68Y_G241*0K-Io4qe@wULJoYBE$FN*;`hEHF>CPL22yGl>#@d)ctfz0R{qc8U zI<-pU9;ljO)RJ7?!rr_+k03tQo=zO)|&-ReTqlB>V)+FIjAjm@jq} zbNXPv*mU2CxIsD0jnb_1Rqtcb1|uYALg^@gJK%e~sK;DXBRet0?n$|!Oyk^Fo{EFF zEvMoTUXx0O8tMjRvay0?Ng?fvx%tiTovYglb3ie-^4*bUDN7Zp`dJZy7Nrz0u{tkd zX>$vxMid&$kX7kv296mC%p#ZI=vzY_4cc1NVr(}q!(OS}1_Olnk^lx%$xTA5@`i+D z_nnN)WbRGojA3m%576e=as5#VMbHrpEhj9FFrU_E(SfJ2Nq|U+4cPUFBXTLIja63| z{6;s?pA$JrnjfK1E2VyN$KoArpidlGb<|{jZiediMu@kbmkU9$EfqM`%X-Q{QnO__ 
zsUFWSFD;Uz`lWVhXoKh!3L7>Y)al~m~*;A9UN&*r%)vmPl;GBSy=WX4nvuvPXQ-?D7i3K3u zhwNeQh@N5z9BH#63g8#zto_ZwFCd%Xl8V%^dS6N&$W0T}@ZZN6Gum_YZ&{Hx4?9A$ z1Ow%6wdNhXi>H5yoo3X1bU%hQY4f-Ut_s;N!>wpR=|D<4p6jX(x7AW1Tr)Q;9m0c^ z5$*}(By48mu1tKTe$0wiX!_wQTR z5c6vYZ;ydu16U%erfyfzXcVllCP=ot~sGcO69X`8H zJct*mth6q=qsk?SFx8_L5aV<)CU8SKyE8)N&48ny&g+}%sGd&iCN5;3DfdAypH#$2 zq+xdxL`z=cbtdgKMDpB&quR`hSKDr7eXt6`J)fj}j2GKDblDH@mUmxm4 zpkvRIE@uU}fO9inVdO6-Q{2Njm$%70US@!Y3gWFo^9JTEw719ns>f-E z7`x>>+6rbQ$0bfQzQXNA>x~RQt<1?8_I$ox8o-bcjbB!{$)ZfMDRL7UhzucU<>tK; zbreG#aFBN-vzwdlzxHW}X5!1UZLF#0`7(CKEzfng?Z|^I$kIaar-H``m@4&@>~_ZM zW;MTohpQ8mb;IogWxeT(NmG#_Lr?YEP(vuIM9#&HVYpm_YSyd`JLhmwvR1R>^-VcJ zPe{1=*ufelK^Z%($17K5RrzPb2P`}@N&$jy;SK|yV^}@!5fhtD4!OzutMT zXZ25z2!-l>KtjPu-K^@FSv~+D&ogmKA}0vX8w);Nv>DE5-ROmkPXqQU9lUyys%gI$ zc}5v#8-^h>X6Z5g$O+kck=sQti(k)XFCgH1_D>HCt2Pu*pw?cIuMe^~zojP+vZB_r zDstp$^WBMKPLlj(V7c1BZlX7X*I)Sss>$|zih!9>m>gg$Jo|p}*~D;oGqATcIii4< z1p6cIL)6R2ff%82$UzxJm&moHg!j70O_1WnMFQ~K;w==1=Ia+^dXcf*JOJ$>J0Ns$ zfAza-`=>Y3==Ib7pWlqwbI+jHWIe_NGb8K>zCY&cWmKy>GZf7modsmS{?lX~Mgr4%>fVCH8GMhHm@B%ly~fkLf_33uoSffav<3Qlb@3 zcxV}~RI%1G)1#r9h%0sLJmsJ*+^hFbTKXK2hsONPz;E*9D87)vM0Y^U83Fq?;zdL? z7Nk&_ccfVIu}Kzu?3$79(7SeeNBjg8K5H_MSB7H_4>PQ%x~=#N4w=E4BOM?ywfZmV z@Lyf_Ecz1H<%*`Iwk8m*GDeR~*))%Rk9g|@YG(NFd!y?AB3#T#wdpV`un=PvRa7SM zt|#Q*dkcrLjP^eq$1Uz+12wm|$s3tuz=#bKs)CcAaTdJP4yeF+EmQ3pRzUEcJf(Tvu z#oMY{WZe6FMl(e{hU%)c& z5rlk4M=n@Dq2)*|cLzWTK}q2O1)BJqXATz;2OQ>BRJGdR<8<9XZFs*x&NjO@2q(I+ zhc{~~UT+#FwQmW+)>f@2w8}9733bfLt@Os<+ARl%7^DE^eOP?Ip3GfKdE{(yFEJuw z^9jVT({HT0hpHS*U+Y`^dvjF2rKgq@61+;&*+pui*@6QHwGFw_KFp{A@QSo^4i2uu ztn=csDDFRM5%+OcL{hyeRR+EfHk7OJfIDbPN6N}(#h1LEy_>(4iI8&w zTC3}u8pm1EH>>%)FtX(xuhhNAIx#`UryV|8EkhtkFOyg}JiB^>yO+zV>J>qxVla?8 zvy>JsOvwTmfcZ)3{NI&E7+#bsP^Nid7Cgm=XZzvG2)S}hfHR5_3?sZ8|vS`X!Ii_mUK1UpH%`O-Rf3a~pq z=HIN#@?|}R1YA^I5@QGQ8mrD#wfg&KU(e?qgBy2B+i-Me{@90~)W2Vql-D96;TM?v z*|fm;SD7DR;Z3<3Uz>TAS0!Kv8GdG~8tPBiSF_TLSNZmz^Eq`Q$BdN}3c=tFxf)>G z?9eaA=F9HiKkLN18)b`?MK^9rpnm zroHzdL85RdJ2&x0&4EiODe)_{80hZs6@0a&H)eHHAw%!{W5@Pp2Wp5BmN*xHG_q=~ zT0)@1b@5mUb{dWut`MavPT%Oj1u4!joYrIO>#c9Lz0^z<{tGXwswV?vVp^_e<7=yu z3Al--Pman@;1OAxF|Kzzoj%=q`x_9V`C`@VzJB9BpOd;8&S!?juWi(ZgwG5c^ER$m ziI@xB!U$JPuO_9T-Lv}tRzvxo=@wE!nhN|FJm74_1a19dIS0mWc(t6b7bzz41@1%; z;UxwZ zN0x%_w=e?$Wd*kZ2YW^#i@*>QEyo8O1|X|*L{<$NP~k?jLz)@WATjpM7VtZry6a>Fy91+IbiJYAjUk!?6~4~KsRMrz_&Jm+QsK!8v4!!bfLA zx|@DLw=19V!VNb2VvuNRxb{8^iy)j`R1UF_+kq0DK7rv(DzHKG_0_duaohp1t3ml7 z_RW7Hb+!NEC!?^?4k??KJ!NzL!$ilT?nD;O21PV4uHofgI#N>zra}NySCAw~Eq`FGzbRz}*~0)%QEPCt`yNE5-`;`^#JyQYl-kBS#_gRxRw z3`*QJb6_&f`%_~&HpmGGln)aMDoTjYkmrRD`4Pd*cRs1m$ygj2tsT1|i^38$j41sY z>2m6&Jf>7|cUG@CF4w?O1BOkia!M}T1AmQESix>_3M&aMyLB@ZW0Ps(*KpJgKsp$D ztt=_0V1|U{+6C_fmqOcPlsbS7Ka@bjLShmCL<-W~L59ZltlJxCno(X3(HnJ_-5mxH zgk{hP?fM!G`wp9-lBfA!fNq3R1jT`)f4V$OU;cb{vj6Sy?ELg#??`VexD*;}8S1LM^W(xe`@oK$bKA~Q``#=5S&;DP&@H)U2$<6YIHG2p@fjT%D zE&}en6=aVXx;qZg^rog-GbzEQPhHc{=>v4$fyc&817%4%d+a(x?lMc8_j_e?E!)s8 z_vUm`^{qSNMe)m%Uw3|a`fJZG7L(X*4+wjY*!$Zx6HG+dMR7gQu3z~1IgZM^i%N1j zLg+$6u?q(OjWoVq6zj!*`nbw%cg<~rAG%a2lpNO074e`y;}Qbq+9mEdb7{neZ$V}) zzn%mm!A+JxfRn9%{0V9y(QsBxOddC^pG>$rScqny9Km~LRjQ~(9Ra8jkyX(UKWVj~ zq_8!&6n(+-tC8FBumv5FggOb<#Xv`>-?YDA4B!kzT=PTMOKy*v>FM>nS!G3rYe1d{Rc$DyxOY*c62<-%h4&%^J^*Nk2Ot4IDu^~36nh#}C_d>_bZm}? 
z$%&E#!pOmzftgh!4e-OndOfIy#^Z142~GZc@`A_CYa1VMI(FybW?9UF_XbbRY;Dp9 z#d{3sV7vd8B!Fr6xy~8Aq7Em_=^H}Z8AMHm=-z+_*`xvvDe-FhFH)};PMVt1v7-Bb zI^9E`n}3Cy?c$UqFss`UxK~EZlDTgZl&h zlXgV3GlQ(q`2S?HjZ8LP$ss7LK_UA|1VlVSTUP@gs=1Xq$^z@t$codwX+MW@Px{3$ z&4kc=1O6kNYs-3Z+jzD4qG!QuAY~5sNgsyocaPEAm!Kqs%=I1IF3Sb)-G-@$ z2O#@iPpTmFz?<)Ev388H+p(MSZ3SxfZfAMXNq1FmKp1yzzsb)znlH{IJ<=w^X%}<3 z9GQbz76CpHP^lxba$J1=F8^E=cjdTj>5(%=tTwM zO};m|Y_@dWLLS(PR(TH~vfoU&Z%ed!Q8F8SG6DXk_aiM8FS-(V8Yy$oqV%Lb^8cg_ZPtr4=v* z-H1CHmi#w;4`!fmX4_FD0^#=@|@ z)K}q-E#}Q?>w6m-Zq*}4iZ`#yK-_A?*4nC3NvDAz2B8b>hFjmCKUuyZdM(A0ikx!d z$$eN>oXCi(D?qFl)v_D!0YAG6+ zPj|fFp#A-|&YCW@>Lt@}`o3mAAX^%GA>tJ^j6?(3EVa!QGh{}DCsx+sy9%#FOv_y6 zj#$Hmn2m}8a3}2cdarjLKYbH{m|y%zHQF8wVRNYKDQ&{U&KD}zSWgD1LHEnkl=coj ztij`+eV%VGe$NbM^IAB>7#u7ihYJ0506`85vxr0<75SvAy<@$5-e+6Kd+)^yb`sG^ zPtssH4+x`x(@_|_q1??p#l9j;uU(<@;Y`@H`KM zp%-maN{i8nVU$9CJN;Z1l=jR3k5r+X865glB|{8K*tIW5t6o0ZVq7?G7k531h3CW< z@Ld#jbT5rVSfS7W480D(t~tAsfc;1r7u`R|6pzWCSwiAOl#(8S^gLu}Ls(wtq}&t? z83{@O!Hor}001XLr6v=rEGM>+AD;|M8Gj{=sP_$XY5qXr#>N_4rJYNQJpmKsX>cPI zo_n4MUF4CX?Fv)(s^=|FK=cCTv7R|jeG8ohD`Wz8WlK&4c|Pkgh9zg|PuojN_VCw+IJfo=$l(<9DHI`@j3eCviZLX zQkGa%4n9UPt%^P!c45U#{y{+f83>`BNYCo;o5D|ZdW4@LUfR=s?Jxnf5BEeKb^(Hp ze^Qy_{LLeG*hJpQ&K*?0gx*O_bT5X|3`%p2%@xMq{00x0K5rv|wmOHfphBXoVbEk$ zQ2m4kw&3TwQf2C_BXTA~z#ewFqvw-Po2G#vFs>WQGT0=P7@u)jj+Otiz_=z%lr*Uf z2MqTUs*g7!J!r|H^z@ZL8M(1W>@$Otk_3_%0v?2YV`vh@NB08PV+#udVtB^N`vj{B zM!V>(wpm{QQ^bMglX|>zkSv0YkkS|`GwK&3;G*UC9{j;ppuqice`1~3sPEQ}#1_7U zMj#m>i5P91al`uU$Djh=Q06nN+S%qJitF7ie)-k^^Q`)LMIz}Fcimxy_8G!B4#O3U zt07lS_pJnb6DYgL7$bp3uqlxPyUvSYo0pS`-^+Hg**_`QuaR6KHG)Bey#tVJTa>O_ zwr$(CZQHhX*|zPfUADc;wr$(SF5NnPUq|25FFHCSXGTUw##)&fbB#H2{A2!KimBnO z17FP?LId~Rcuw0#l_SGMWwXGm7)2e_3v(SQ%qI=EqFT)-q*jk_6lw6NFQ$? zOkBpCzx}pobBEjUI8+4Wm?_f8(rPo1G9;-N89^G^RHoE~-L<#M8*Mg7uGB(=x?nRyOr9345_k0gBiQh|0ZW2R3k*Y9MG; zGNBwOLs11J*?_Xb5+?Y7*Z**Ff4sH38DaP^D)pzQQ|{T@ znC%E{9t{%0#>Mw<`Tuk*wAj|tBtqBz?=bLYZ0jFW`TGuEQ2F*68{5=>WAx=$l%C)-mVixa)jnWPhW3|D1+Xf;&SYd-D!U-R_YYk5V zf{b-{{6lS|dFKI%&bU;XDGBQ_Z>T^`g0q0xWJ{(d!IlYv5&==k1*7TVv+$E6-z~Bi z7Ghr{^S;Pp0#^iR)>)bpAq~OoMwSvn&4$$-F_zEhiqV!aUhT2Qqil&$rq6gmo4#QR z;;~GVK6SE1Op!^#kZv`vc(%Qd<;59LR;hlHRtIH}b^^^@pq}X8(LQ&rEZYD(sVeF+ z#LI-f5&_Mx@3AF-x}lT(BUQp0*W6`czT*~f+)3%t0ZVHhVLZIN+|5;VTfpT_(sY(k ziNQ2n@zhYq`GRI^yNnks5#cA*FK>LNx#EEqX+@lCBLTmw($T>jh`@kkxqIKB^GC7XnyA%asy zcgbbiz5%rJJLCwdXz(1x7LxIDT{)=ufR(_b72lGK_Xe&3)+2tS8Z^_&_cC(b%XcGw z0j33>w3uqvde(Yv%idJ8V@!>KdU(2ioxJJKd~tj$3xN?sdK3uwjp+u>JsGg1_Wk!)6X@A~XfItV`jb_P=AiHt-neV^0fD-+&_q^%M;P$za zldXentX{)8yaA?t@w`Pt>~{UbQ>(Y5p?>XhRyTG;xRqT=TH1iEy9=k?$~5Hi zJt%8ifmU}9v*d^KX{POf<#rzX_h@>t#dCoYr&9yK(EM75*G@3IugV+F_M(JcF>bL% ztaTvK+D1D`@B8<5RN??!H1p-uxNs1u-R9fGU5ZwO|M+UhFMB%OpHg~@UK}{-2iziy zTWQx@+u{NkM;9>F=C-}`+ltN_GNx#sd0Rlb?Vp9Sbmea&?UgRhGO23s5ayld_iLL_ zdGN)B4_7Xr2s365D7r6b-A8t&Ab{lecOGgs_xm<~<#-Olb|@4g^}qnN3Q?=mzx>4T zIQHZ8M3L7|Bkb(Fue^v+?wQDK*!3lKKqCazc@)n45kwF=8A~?-RD%H!&FJk<)DPH+YukiXt6=Ke&{YWug?cH&?$6csDEs#a4lHx5 zg6q<*9h8D>5Q#e5p$Z@ZvHyDRc9#S46VCc}q5Ra{&H zj8+4q8m>*JTQhe6GF*j8oD^)Ykz7>5Ji+aG1ee8D(`3DwHRJV}CjJC`sNdjVj&PEu zV|EehBZ&zC^}=!7>F@wMo?clz^APT^*yvu-lOaKG6&n=J?Bu2VtNr+b3zXDA)j3;* zO7u!twyWsNJns4O$QxNBN;x1Lot9Dy3N=wnnD!MqW8}MIPT6g zdFzLiuub~I`XP30rRvlIX?48V1S+XM*WoN_A&Hr-qSuA0c7&_=0PjKB2R(W4HH~fa z7-*{AS?-!+YXx9di_yXCew=MYjk=#Kw~4rs-MsWniL8QC7WVjlGo-GwNUoI>6;djy z^}?}B`YaTCwYTkheVj+fOlSI*uswRCg_<6x~dNaRB9mro#xsh=8JeMO6~Zs=zm1ONWm0saUBEy3%jK)`@G#eFTdLWsBTj zXmvaSFVU^sL80!_%aR!PbUC#nvmj2k=niC=Ru;O*V;Tm-kWGD-lH%7F*Z%S%Pzmjw zw&=KIYl{4{$Wg&<4M&#yx?z1UD$6xw z4ex--$tBQsHT;b`XjQdaD^aQiTSI0`te~O(wl7Y(BqBYBh;<-jTclj35mG;qKhc;b 
zEsqLdb0v}lFO-vR%y$4#?59I;y|b*4!rKymyf#He#G0&w{4 zNfaMZZ3HS3B49!Ryvf3PyAfj7M9bjE0KRuQkv}wm6a=9qwnOb5 z{fhJDJK+wcq+!%XM?Vfi=`69!P#V;DLMysU?I|8I7wOTQ4)u`089XHFKgVhGE0pLP3B|ZZIqI%%x7E8|-mJHb@Bs&rhVG%OV#gjTHUZyJkfsv*pnz@dTF04T;~qj@pWyaZh6JlrH~59riQ3`Xy~ zQ~^6DXB4T^;VxQ4wBYGWCbI&BA%Ka@&Azw8``R`%j#KSV{iU_ELf&R(qM$;Z`3qP| zH=9s9a}BBz-}9aUXQqR2Dezz#!U1v}NgMHK{c093)U_!m`{ zO2|&S0E;n=;P+LbN%Y)R1ZM9<F^r#I2CPPdY>`-T=2z;RW;X!^G{*;{ z;&1e_{g^&Q&6t1}`iV$dg)DHW=$Oww{%n!MQKS?VqNYTqrwlgioZHY5JnBjqEj zVr>FTmy0tCtobDlgDFRV@woxk5~(EoM=_!Oe*9R(_|h;D;cfw1t-3`u&UZ8+jtPN? zIVLvG6dMq=*(7RsjTg&a*vVBMJ{Nz#(g@=9RSNshUGlabAF~Jyn0V7ykASL> zB13n=kepo3Ujk_M*_nxh$4Hh4@3@j{1gI;_TdhL^DL4_ zLzX@KpJhuv)_UAh2q=-}X3@P$;CAi)J^b$$0B=4Z7A|OScy2z)B0Kz7zEyf^jpE># zC_XkLlXXVp1ID1Rd1pgF2wUSv=9PrMNFVdc#Tg{M{eJJKQRdymqvDUo+n4V2rzp=>Ok<2{MMVB@$7E@r)r3%VzJofZ+je+1%JPA|3@RXnWRppCjks^*ip z{Z06g?n0}D1<_i?bse1idhIP4@@fW>9cw7zgv}yJ+%PHY!&>Y7DD|#JK@eHy6OV^c zY6-}jP@-QuQAltX<&x5N1Z-Yb)TCN6FEk>b=>hs9Le00|A0C_Fh)gDJ(OYRT)YP;h zLI^rHS39Wgjzm{TCvW>2fW-v#C0cU0h;-MNDf1BiqhEZ+Uj`t??xG+|$p5bETt5*! zy>&Z%^;K^=)0*->`868N*r0JcQ|*9EL7QA#(Kr;S?vSXemXVxBb(NTV>iMU2pxnYx zK4ZZom!dm>GO1|$tT_gx*g0%)AsijJ zLTFOY8WRf7C@>b7KBJF8Ut8Wg59OpoK!C#>;9a{n+RD-Poe;I+H(iF*iZe^{2{Zv# zs79`p7R^$ecz%7J%3u4tKepM2sMG!Qag3<5C(wq$D4W_Y>BZDp?3-Re(v5Y~|Zk3j> zIfGX12`On=R_&yJD9)(e;V8gK`IeByh4(A%3b{!(=ASiE)7Mg6qOjajU({$Lh`=w< zZbc%2he#jC3vv^j^hpj^hl=2ULr?N&JoTuJ_po%Bw@eyN4)ql;WXZow-f=;YN#UJ6 z>Z-3OsM>m}1`tjixG*YS)z|(eh|%=O&(G|eL>?cIM-B0H=~d?5n8&W{Imi80L4P=b zxAA`kq~qWhMmD|sKClWug2;AT5(2;1c3B(Rr;zlD(K13$Q`eFCP~qJv8S~MB4*B>z zLvqZD^P$!LrA1VOsT8%Fk3mDS0*W6X=x_&;jc=WQPvQ46E{kEf(rI|^#{h8mF@rH< z3gy2g7>-}RLTY0w2*Qw??~0Ihk9mEBD_U7ix^9ezXqpAbIJA-j5EDuUR@j`WAS=TG zO8`jsVJ$Ri>u*S>GKw;hi(h@~w0LY!Dt|6a?c`u{@4&P4dsII9R@dR^KwVeffVZ4X zme}lL=*9HufOEUPT?*v_9hC~s7fGoz)N)Bx!RPY?b?Ptt%)%fmJ$HN}T77=~1(L;; zjd;tiOiFIBk(MX@|H38w!@Ly!gKygm3IG6s2mk>8pM?h}OYXRjbR!ZLlG9 zU8p0BiE43d<9SlRyv`EI)(d0;$T~s+(bFojH6oQKs<3^%_y{MUm~xMyRnw(PPOoIK z56n3|?;nF!x)3iJ4)AsQJ?*1wQReb`J(t=I%Q3he#=?sECB)Q4GgH3p%1+ubaFYI1S`sDJ&UFzLypB_AG16{LTHjFGqifzhxA2I0I=5SG>N^)P;rN^ zGp;Ux@wcE~mt|Ql;KuA8I&b<`UxXR0W6SaT>8-SfO}AOMafKI9V9hn;@`vZxjJvA1 zG*e>Wt&u&wG4YDEAqFH18gy-F&NX-FHXA)4eHxTgF-2J9Q*3woJiK3DlWveL*f$}q zQk=(XPi%Y4N_BllI|Nzg-D0;kz=b!Q70rlFq1xa;2nD5SNdQ=AVhT2+9#j~{Gy&xC zdAjUJDNKXCR;LdEa9n*`W#EC?iVB;1*2tMdruNq`8E-bc_7l;8%;dnB6lTIjNPwFK1xIEi>GIHSDJToJ!@9eO6tQ>38Psdx#Wc+D?>oFA|xN^taE|gNyo|P3QPP6g;PW5mg*H;2NHM!sbM1*13 z#e>sje~5|lVthi<%t`J=urHaNdV&`dY!ZM1AdV(ORsi-k`CN=Rr{K9P&v;D~c&8sDvpH8ur#x1P+x^QO#l?Y# zhF(5yQ$hUlWsPk6X+}D-S7@#BjtE)KA{|Y_4m)5l)>H0#QW^UnmO_a84$)-J3gtLJ zZ|JmcSW+);gbhcYvi$=uZdY6#fwNbwb==^>5?*!bdnPnFR`QV({^roBe&ULxvLWf4 zGC&$=S4zP+8z;h+EirKVI8WEi)rbRPE^4|Nx~ZDEctj~puq`L2vWh=GdCFfWaQB4G zf61TYOJ{R7c(w?{hd)IRcgljuA7aTu#`Ma*%(`a%#;T@R;05Y2yFMCf^2(QpBdIpm z0QFBzYZauVZ&*#H3Y&=3Z{&hj>$;Y=zK4$~vfhC~lZ53<+JhTv2Oao*QgRfI5}j3D zB&Es!5<0Qxe4aM@{C!<0wFXTr@>RfzIjym|?o_B&o9L#qoYf`^F z9!{F2yEw#kwO#IrSTZ2Y;^o5MKmB9m&>40qW<}1#@ zaF>vLA0uPPm+t(ygGR-^?!If^J7!yOq_I8Sj+?sTjH|W5BXZj|< z8ThqdMr>RmQW{I#M*&?vRoP)9TWJH@Om~Ti{gYZy{lr;LY0bp@`u?&%al&noY~KN+ zl}GkNqslvQeQu5y4!Iqe`Ba_);+oJQ@H)ZB}e9Iwk2U!u_W+ z@K$U1{!k`Xu8Yh!J%_!wk&}z=yX+z52u^_vSk*O)?`e3ZbEDxERI*dkrig!KV{7Os zce8EMj>98WcTH#WG3{mOSl=-6^!UORQn$Tg9GV!!n9G}U&ZJ<@F7l}&62UE=`U~)% z=nn<(&+Q*r&VM}l{~n<=w%AD}W`d@AsI4P7aobaM3kGt=GRNegRhOZ*nm+W~T; z5lW{bg9tboPM%%M)CsRfzVgCbyqC@hJczn^1O|``g1gk?=)8{-?qY_xsBsDVIuN~Y zZnjv*r_N+h+5m1eYyA)i%BM;sO{rNY?9O1?!de89E@a3>>Wc-$_5S<@Cd5OqjY5B7 z%e82UL8x`YOmW<}wW}3(wh~My4|~^yvmLP(M5ueapJe74$j6XY20h(+U8liFY$_tm 
z_JV;{=88v?j+sZXBt#25B{(lYRbk@8aIt@nx7B>{Rb!OaeV|t$!sSe%jMEkA+o$I9 z4_hqOo5+s>i&7<_l#z0gW3yFCBN%Kr3KJKBG>C(W22v$M#1NGT5fK~zxUtEUX0#mc z#ST=1Izw>E6Lgalv31wV)Rs_loM%neg&Ot6R5%+JZnH?2i=Y!}7U&#z?b@>&%7XjC zZf%t?ts69MuaKVt#=OcZujVQ`A?zfo1TE)IvK5@LK&mq)!21Hp79c{~NMH#pMAW@e zDByxiMKy^Mr_juGzZkzFj~|YlmWH7V4U4@0@!cVMiN9NIMw>v8igPN{@~=x63MPe< zHC13_z*sc8z5c+=o10~U7k_L;RIDi@Zla3<<8t*X)xSeS<7H9sK!%;Kh>C4(R_|Gr zNtc9l;RhBXi*_;#r0LvyAHi6uHfUGN9Cjb*wR7p-k^>%?h{HTUOaVP3k+nq1sK_;b z?+{q?VwG9SA=ORS0Z{MWm^twRZRlCCP4c83|8v`Jmlf6?kGbh_&k;e!g67Ce2&s9T zQxEwy&d|kt0OEt1D4vTgh8j{L6%>FS2FP6xemPVEVgHb2Ieb^2Y~s_G>^dHC8;V_P zYr065JvRAAoWuQmxW%iQr`%>I;Nw7=79>;!HU1X{1ZP5 zUb(FJ?%0F@i_KX}RVOBhEe2Tqz$a_?olQ?xM*Hw;KghKncw0`w#~Ve>0r2pxa`nie z`7o!K1w3uvs!vCFT)0P=e|^KPXIl@*wzT_xL1VFBD|JY>^<8RWgUQ>yOR5+{>612- z_i)<`d#dU*azp2){kV0oZlVY?n)C!65yL;L;=uOK((a1Pb&JGp@5aWz_PH6`EbRMS zqjTwf8a+hsq4VZ;$EK5;x4U!Qi@yDtzQMwC1N%<8(Gh}DO1_#2H=>jTUo_ogiZMwF zA&)WH0U^DQ1fp}mC=b`rIf?WwOLGTgQy7K*SFJ_}dGz@ZaWJ^FB$3?j_&@xM=tOZ= zs)d1nclJ*{_rmy%7vYsz*jf2^ZEC`Lt_RKof_8{g?K42gIa+`eJ|8;j|Q)6<@!D@Dc2=)jL;^`Mm zDM6H5acyx4YnhwnQR^P96?PvR5~0KIFf(y7yBRJ6XBIAZ<8E)X^x~v&{Q~}X{4B(# zkohNktNg@Jynn}NBM)a&eJ5u}OFMJ@ek2=7C&qJ#uAmxhr>bC@d#QGECzrk}*pL{2SEl{F1#qZY3|3He3~QM@Ey zNj3y8h*$U?iy<3YLS?KL{T)-9NSF>u_d$-yozxrp)4&M7268(Cwa~&AOck+Pj+RcUsGtp>P?#+>QLXewa*x|T>3GOcjNKSXYd0jAKem2}i%>#a$05l&I1wf6Rp^i#DhcCmsDy8?qISqs_TEtM^A@Nohp%>=D z^SVOS!==qVC5E1^EG&9Z7Geuo?)*aW!^k{dgx^k&fiGEHQHrJG+=iOoTR6Eg8^ZIt z5UWHZxlHes;{_haxec~l$gta!mJXSVA1nmR_3lU3zuVIo+WNymer?1zp4H7Oxk;PB zktCI|R-EcyKSl}7-CO&+5n5i0T6<zLTk=tEuC^rqlm` z+q>eicG}=b-1(%gyI0L|uB;;=Z*)uHwa*#(g>;g*ifV1f#G0g}h(be{4?zd0HK)7f zw`0d50YpNoD3_hlg{6vz1i^y!c@5TU{Cv8DS1&Ow+TcF!o6h6+h`;LSoHd5ZLZ0cj zT%XQZFJ)R1m8zN^44sR+wk#_0sqRrFnmHCu4SE|XxU#79sP;zlr2WWrKxm}Eau_k2 zN$n%zurQSrCx`-_YcbTwor@a(E)K(Yay%B}!DLd~-*L+|*^j!yqb%&&H7A{+LJJIj zEPndq`;}QcAPe{a`pJ_f*-uPsa!&-`_!bYziW)I*h;&!piQAw*wb|0jz1J9ZV$R>0 zD^pv$Et{>+FR!Q9m7~|w;rZR@ahL~qILnmyOR{b%NgZ92tTF#BlDdI^iE8rEc{;#w z6VSg&sVS2p`al!IC(}q9@b;cG&e-FR)WNf0D#$iQ!PV1@yq>In&Zj$D7hhH)gfB7LzZ^nC1y4uI1!(12P1+Qn%-;b?V6Vq;S$`5sI zd)7gkVY#9}YyrAsWa3tHOROjg5vxbOSB(nNDyxd2$cc}a$cLWC3P8Iw>_H!)vKtK| z$gLZ5Qxh0CA$P=2ItSlh-M|-WP+^u_%@EHM9$1w;K6t~C@EaoYd=)XEo_Wjj!qSn( z&tIQj&zF&1U72+!>Nk2iGOLG?pD#xjD>nD{fdRVQJ#Bvs(`Ke-#)hUqJoPuzzu5V+ zf9vb;F#C&C+VRjTN~~G#G5npi?|V86XlG%lbEo;KVHoE60Br%-kGas#!p>3WQEvlo z>(MOqI$ti(8<$_bwBxo;VXk*%!!Zm++Xpp8?*CD2 zhEIoU_QaP=9N+9z@B{P)s#u~|Y60-e2_ti$cngNNlhP~a^78812BEZfMo9#Q zD-SR_n6ntP&_ooG#A22*f?!kzCG{U8LtS-ZX5rC52nFph8D0how>{Kh6ui`rc*CKf zw3ntxx2~5mWTXZgY1E6fkOY9iB9MVRNRQ1gcPpXZ6CpJvurzVX=%ryP1-fckv)ird z{^@Bq+2-vxE@ zLHWKj_x%*8olz~c#eRYD>h3u3Qc<=`}s9*v#!eliGVr_&p1fw`V88S^LOrzDx z&Zvu#RCYp>2)C}1i=IM6(_H8QDlDJS10u5B&NN1s0+c}8D6AG62S)2w0ajRDEw-F9 zT{v$7K8D;T^oHbRsC4NRuhHqS|IR&*QGnRHQ8BYNkYa|(J+S2wHeRVI!=RX%T$|Za zaoKJ8YvMErnQ|$C^KE=|u^^l8kd;wh#~bWD)uF)@*6ZZH59Y!^yG)?`nc}qu@{^r> z;2_q7!_2(Q*Ch($SS!%H@UfuWpEkWaCFk_DO>89AZl1$y$MTJ0n(g#iNa8SPTivBV zH24F@n?x&qI)Q0T+A>B)v;(^y=~sHbA^1eNV8K0EL3$g=T{6I*0GBOR=|(_Pp-J88 z3P-%*+y#S+dfCnMRa05Gbws@X=|lCL`R9 zQt*+@Nqd|zM%=rHZOBSZoD)tCYw;|rz53lE1Rc%^JHldebwq+a631O4Uabc%th)q&w^RefcCaUAaAWG0zy}YW%j@NM?4}Y>QQIC(3!?~7nFZP2&W1(}BU-HNF*G7R?^@Tynje#h!%M;- zHtgK=`EMbJtUh?}DYgZbND*-8nw|Tn)S{1c45N96=Ww6FI?NY|35sMnxYrNNgIMhT zW|nnZ7%b>V`QTRSz*d~yJ@?i1DV$>{B*GvI%)s;^DtB;3s-OG^dupbD*yL@GaGo<~w^nKVv~(8~yi--b8e5 zx`b-CkrSryV3mU`0jcny+@3>@a3VW;xdNgUZ(N8Sr56iIsmm}B@wY&irn>(hL3~`N zIZFnNP4S!UIYPIL50EDQiDvI#UmMhfsgb2o;c!YV6h$wa`Yuo^@^=vZUm&1y*ZRUM zzQ{m$*VJ(@H=OGMcT$X%ylY|n7g7%RgQJ@_FaEX*H@8|Z5r5;>4$s$6y@Ey-C;OPW zw1dT`$GwmcR03Iw2(MeFv;`Dh53 
z2Yu}dqT@L?a+D;lGBAy(WUvmAW%N=lGrfzPQvLzDc|FM5Wp9$u?GL0)Nzm;J;KqaJ z?B6{{TIA~xE=3ICt^QlOUB{CaM44!LNj%$!#}U)~?*IMQnhCFKt#j?;Y~`@F%gkUl z{jq4Fimmq1QZ<)zy6(qvQIN`X6Na1$mcg)=k~aJsFxOwZ1cEvh0eOaEj5apLLt71` z3`f#s1&m(~FPM=0-QwXD+9GH~@v*&ZThIKS+0K$;T$w<8&#hmC^J(227N&WD0ye#| ziN`MaEaR}F{|gJk!#uUKo*@1hClqh z_&d-}s||+x>u_8sL~TjNe<@Vh6Oh%2 zTrY&Q|NOj~D%Y-r_zyoi!S?lYdJH$$*Gi)CK^yFfVARoVg04tFh&KC8OXCU#mZmQP z)H2##%O-NIjg8YlMZ?8WF2=7vH3DrpF>N=?=2+ZCHCqZyS*I8Gv-o9HM&fBeRF78i zsYdlIY=ARdN^i!)=tPqK7=3abdSlBwmr^&zUD$5p4DVf<`&(#(xz`;n$9-X&T2@1r zj{uQZvJ8CAfzRAJQv$UCA^de;#{gc)lP@l(3t}$n?2bOVjo)qCewR}1IlA3}<@)HE=*AW?trdlepzK0sG8HOxMFSCHRFTaBBk>iK;;{k(0dlTJ~3 zwJbIVgH~$uSD0Kke_gi4wknAF3AD>VMbSo3Hv8WJ9V`@m-Z6HB%BTb?GQTHp9Jn%I zq_>$_fCS+1!95~E?L9MT?kqaWX6Po5(Zp=5yf{&O9-Ppg0rS^nbDg@ zmL=^GRE2v!Z8u|eIf9`eon~9*m3USF3E9wvwzpqNS65&%3=Bj9xx9y7tjk7`^**;d zyp>a|uxa|7?9*f*%l1Ycpt9sQ`4a@l($b6uAK`mnxy6Cp^s1C$4G2TE+ zNyrlVmO2^D?kY^<2(JS|%%Qn!pu=o%84h)M^usKx4>r3KxAl?hIvS)g)*GUOq)n+n z&3pzZ0Dz?^xM!D}-h*trkrtwL?7x(0r>4>!0FAa0#~TL`yuu-f&kH3dtDY%OiV7?c zKh3bH1R1)7E`^k@Rc_Y^D`#t%Dm&M%PzMY0 zw>?$;N^`|7VY`5zbrYVbFCV!D(ONtvuMp3aNxa<@D72}QqF>>uomQ|RBmWPI*+*fy z(8@wex&EM8q!t0X0dy34k^=8R{n$d1-_V9vI^ViFt*z1~#7@c7s3;T+6_QW+s#84a^N7j;cM!D_JGQC>;b zMciX1#(N?M*QZ>qAmp0$3s~ot(MXiSJJDo-IJ-O;R+Dme7WRlm0L34%iAFs;DJ?Kt zinfmmT&4U!M!|5QnSTz{jlh!M^1&u zYzFIy4GRMY^?q_fOup?Lfq+5O@{OcCqt$CSe<-ARj@pU`~xWfgW9G+nu={2OoN8NcV{3ROX~CKJ7x0#Sn(e&Tb#Fa~Tbe z^M?u)iO;hZqfS8NP4$WcZ}*2ggGNyVS;q{S#HM~Afkjs@jK*fFJP7P+!K*f#u23cy zb8YHw)5hxWyyd$VGViQ0(mETm=Uld*CavLuY zJ?sv(KG}Qkh&;)XsKS)-P-`!)#LQ3k17|THJfhUj4KT?D(`M z(IJXW0){DQZ>@urTFttxL(NbzzyF))LSs|Y`~GvG^`im+VEy|-v$XqBTwH8_YUQRT z|BK+Vr6KL~(`MEEpl+}Uonq*5KE}*P(lSpaOC!IJGY<_$l*VM6XaF!8`Q;-3Kmv%6 zm?9<1y%cTAgVwRb7ogf_5yPaV8Xu9!hLhqVogu*Ky(M5mrcEFZ6Iu4VwPOcwAyrZE zNK%AD>7eY?y}YZ6DjyMNmANk07>Drfj~e&y#c5VsELyf-%gB$hJjh*%c^x_Kc`@4= znoM}4uzJXV+6@mT@fBJLmt)7z9b_``n2(!NkFj%T>8qrX0ceyf-B;HX)x*MU2E-;h zjeU^2fu&mr=Aj78qioS>GyZ$nnR3J}c9gdEi+%oL~My~tw z<=0Ic$g=J`xS3{CNXSXW;I+ax1t(yU0QfG|%amTQsnr|CkXuJOk5cQJMZ!-o_sTwFK(blIxXzY2W>p zBcu#4lP4$wz?_x^lYChQQo^6CJ~L*H*EwcV+7vRnziZu9kF|l#_$t?)Lb0^8g zwv!M32-C0D4DKB%3;&Y0c2*0yw4-v>s$A&ZsPffA!%gEc#FsPHcS%mF9P31^dP>6& z-LrW>8f{-o&XE~*l_hb-BdagHtf%B~ox9`UVLiHB{Qib{RSbu$HtIB*~4icv0a^quEp+LR< z-SSPY)_nL84Yyc6txWrY_pn+2sejJFTkYGn^*Hlw?=^Mv?{youJwPi02mqiA0ssK_ z-{Hi{(A7}i(9zM*<6oNF|FCe!sL9%|3m|m8t5Mv}54N}D}oocILJ<2bs^ z@C7k}qon7oLOoLuF6GKk=8Q8Bztyl53r%ZgqW)p3Na)F<@Qj~_`5yQ8ofEe+M zM+M?U0i{>z7OWPYd0+!-GK-VyJ7Z@IT9&uzp%pZC*N?FIHNuj~FPf)#nK>Yp(>)C8 zm2QU|jJrw}*DOF2sOgRUEV@1jZ>K@z_6ZxPbtKMd7Z94?pn^n;fgbym&!Ro?o9*y% zz9e(JP7`?+%(Yn&%Vl3QR|@%N?ql#k8P+*7U?~f^ajvdcU#!?s-wlf(iSIs(MOfp= zKNp|s;jtA2k%ov&0nDZF7U*^ukjya23+0R+4VvGolb`$ox}X)7#v`{DcPv)Q!yDih zoDotsr2`s`=FAxFdYD#%=l(n!tNH2uheJSKezI^!H|2F{fIfyZ3Zs zUnf&*uPHO+G+`xKQvVDT4m#?iQ>$DnoYgUM4g;P8+5X*cI6|$nqCRNx7RrB?kl>n1 z7t^$6yfQ@kdgb-{V1o;nJ4i*ZKlqxos#bgkxbj5!G48B0x9yy>&0ZnPY|^CHvi_;QGi~{w(+V|H(^pDh{NG zWFt}5e|pnyIsSjL@_)>4V?!Goqo3}ue@*dxLxq%R6C;0(6j_yrqgib;VG)UK zl<^o3KOo?braYjC@lNv`g!GoffJ>z7j|v8(B##Z`P~g_Z`gI=x{zbHt!3+5d#3Vfs zYQg`Vv5-y@iNAeZkbW^K=+ZoqjtQeVs#btPKIr!Z_!`jwou^xvNr-oeRZueylDw88>KC$LZaht+*?@Kg*M?)`+ZmmxpA9%sGSo!FeHyAM%xQg_L0lN86eZLwQs0+MSY zH&K651|1Qd>xD8W2ZuV~71thi3_iqOfN&%@Jg~nbhByGQTGl~y+sxwC0`9LKZv~^GjkJeWkQ7{ zh_?G3XmZ8@30ho@sJ!3u)78FsQL}X*#I)Mjp141hI&(+jgE>qwgPd+3T;*b)_YQSm z&&CWeuBVnLNw`{qS(59UBS>*Z2hc8kHMp72Wa{5u3H*T_`9Gu!5cBC z)?jgLqCMa;EaxWlN&)uG46f^0w5Sy^E&=(E2m}daSGY`ea zsIki91#-OgWjZ4_1s8Pfb54pS#uS0bW5wn9!X&X262HQMo=gSo@mVczPfPc1CBI7x zb(^aDwq*(56l*B=Ig7WuJZ+p4r}+qP}n 
z&aCv4wry0}wr$(isonjh-_v_vSO15I6>CO}Id0N>8}KhsY?ln07Ro>Gk}5YAR6hY+ zvdN-9ChP@oF_;X=W#zB3*pbQQ?Od{zfS57TG?L6_x~Tkvd6i1Gml#^IAEuYlKfNij zBOyo}zyhC)7^m<&IgKjP{Q<=pWFbR7hSiB#y%^k_az7 zmwC7ChL^!4=#&%Ve9s(VRp*Wdy<+N>P7N$QFSBE(c`>)1Xb9g|7pB_8Fqe>&=%>Qz zVnee~sUXTC0O@Z6^rb<(mSey!K6i1A;2(!>DD`G$na2`kN)Pe_<@1u9K{Zk+3u~)p zJ*arZIs0Vr9Po7VJ-95^4=3GuPC!7ZK$RSM-vtI2nP-fv;=DuF+ti5W^pp?y#I1aR zcaw0j(e>+_@T2)pEkC5cChx2rlqj-9w#&6wuXL7{TBX}eKcJP{O_*Ipni((`_o@|Z zhUuuB@=747x1SN5_es*0?Re@o{u+<4OS(@4r6FWNl#Dne_D(uEsN+x|WlcdhY)~ls zz}T_Cs$jx3ieyoNX-Q+%=Xmj;YJH-1BgS14Y?A5yCLx2^CCu_`iGQDZvUqj07;bSg zxEVFYBfWZxSy*)c@dXv~xh@(6c@6u;2`0gXEAwTQiq1sEMbY-+2+8aPttmTmxOCG((3r7EyZaB$UJ~5zSeYXeojdCtskqKh*AzIcNB$o5;MO=Q?<_icVi&vv-zvOC%(^ZC) z-mlMJi+jQ}5mJQtYo}d(s&QPj=cM;cp@gpg@)6mr6aX2-6dXEV@KM3-LTFM-!%!Ou zqcpOgEXi=+JHStLp?88xhEk$dN8s6;%1ONAxusGLRbSiLRnI+wP5miC8%0Ctwbqf< zzg(!@wN8DkUH@EUanuA$Lb@w08>+8z@*7EgiEz(&M3LXJ|h2wwGFH&FU7i_Yt> z*ASUu)T-%tDsHc?evDJ(DGH2f`-6oxN4`loJt?t?&+3k zSX1hN!n7oGV}-c}7MLH14`@jlZcO+53{^|0E9L~^jN?(sR!fM)5l=S!kFaE5rK{Sg z;q<$mq7WyvY4&)a<%~<~R40~GOWY8av3;@pvvE$>%vo9Hr&=+yF`dfMk~;L3brWA{ z-}n9{4BU^d_zDh2GW44_Zfgn&s)I8@*aG z-m}e@)<(CtFo*G;uwY${@rJ&>ipP{CojOO&B~IJ<=Luj;%}RyI}cs?4|V$@(aT{y5le=sRlo ziL;9b1*eG+&6y-T#7-zLyol{yx%-( zJfZME;QyHkSFts;SN~lxF(U&3;rut5(8TgTDa`*N5ngF%%U`k~`OVbU^-CAjsBHmt z;YLD633O#UQ)MrN!w^l7CJpWmmS0^q{B(DdN@tk_7KC=QAVy%#_&5{bIQ3D7IBr*{ z0S!l}!DGNQv&woxq$z?2F|-LBsyQuF^~w=LITfzi$X}3XYRp zFo4epTKwg*?}((+egG?0)LGEW6geHgcpfjhf=)OI@BL)^7c-GMPUdsL#GMLFk!2_a z2ZEg>m_QKVV|53mn~oWth)~^d&+Q;i&gXfVM8W5Ip0soIX#zBbhNk3OjOJ6b*22vx z(zc7b*!DsWnN`&1Esn^{$UiH(b9b2z3s8Zm5CCKZxTR)${Q!(C_elg?Dv;`%`NQ{gY7j9?P+&ZL0ae-pfu2HaxV(Fr%vM5@#6|qYcdm>_X zJcY5~GrM5~7CJpW8*}b+OJ{t)#7XD{&lS^HTM(*VV&YGTH<84)vq|?3A*;ThlVPlR z|7Q@b+;yRWmJJjfi@dFyGG^8LWw596P(eBJDqPZS*D&2NUSkF;46-Ykp_*}0i!GN> z1%#`W?{s2PPfhW(iYU80#Z*E#Kd}mUj7=8!Q4Q`uG5I%3@P^&F>cS4hmwO0bqIWPQ zP&u-KKEmO6;O3##0*=W===!nj`Z3pLX0^l4^6G+MYrOLp*c+nO>5rJljhMur`@jhs zM%(I}<)!&xYm>m#VbqV)kxBTRHq6>8V<>O^kwuHC2HlEPtaP$YSZ@kXW ztNYqagHNj#FW4`eJyn9Kd9TKEE%P{it>=!?dhU%cdr6Onuu*yN zGASx1jO{y|u$3P-o6S4#ztr5zGV=`s-Jxdf6D?0B@m>^Gw2l&xaAnzK@HR097s8)e zNo;!`J}NkV4g~5moR%76)Fhf+5>%1Fb7E{0EqT@*@zlw4JdnW6Lcdr1l@`Z$1P;FS zT;X#b)XL^vU&l^!FfT~E5T;c3ROAK{v>z7gvE#&ly4+;YQHttd_m07@NpFs(&s=hF z61S5;5Y`J8D7P^A{eN`=4Sk~=1}IFZDL#XA`v0$@)_>Kqpn(2MGUk^Ctcphn1VmQ> z1Vr%Pd@RfjT}<8oe^*HVgLCYX*V1`ItYPPgdU&=H=2QU+SgNJa%d0s`X6^o5OVO>t zv`vAKv&bMAC|H>3A@14zl3?RH9hf0F8p(BV2FHGxJTNEcI;Z>ibdP+^t6OxqZ}7>y zVR+SWqH6O-^DrX~jimQZH4bYkLL<$KN^el?%#!Y!v#y`h4y!ku@!> zFKhLgB{C^~oz3winlMIkzhf4-nU}pJ|3FlCKc=8p+Q`Z|0)E>G*8?AK#BNvIasm2_ zH~&nVzeL}72CtZYD5>p$n;F}w3>vS)c|=-J@1YiF|H3BAFN6Dz)t*tlb5%E_FHP(h z6p^jxbZ*hD;9!2C!Ds03@7eEsUl*>wmx+68eLGisM?(fbMbrlKDT+*5Wq*yh^ksDA zZ_)IF(Ou#O1 zz`jFb`3rUT#R^SjvKd@bUQO#m1fM=$X`%+`#`Lsw_rbrbCf2`6@}aLJ@MLs%?l$=? z%I5i9N*)<}$pZb|>~Osyyj(A6R4caoGTA>xQ++w}AAW%gnX$B#(mj!X9%O-JqsaXU z;J?*5A0d3hO?(=-bG9)roXOJ}ExUSGynHpbf>Z{dYD3`)$PUXZfQ&uKuM21C%+LQk zei;n48`eV#gwsbvc-bM)Uq3O&NX5mNXUJEETpY)TC2e8GUl6r9Fo1wYN|?uczHIdSXT_27gU0wwmxjaEKBB@ed>zKZeyKL|)qj$|9= z`R)DApIg;b*eGo^Xy&_3e=QcD!>!sm#^lDq!5Qcq3hx6?0$U?lmh0Ks`HTB++$Y8I z@Ed>Y&e{n^@Ic|t<-^xR>n(8voWh*7@qQSegr}YvC&30g8jGM&j6n_ zPslO>7OZ9FOyI@E5FbbCIKdQy-}txio?ZT^KQ>tn`&%S*ju`(G-pUG>t`OT8jSIO! 
zecUqI`X-4x+B~I1|J}kZ!2P)_Ye)|2I8jpZk=fm@><4O#kXk!#YlRZ$A1lYshsMIk zo*F?{RO6H+plq`j{Fq~?#SH})KzdK9dwOXPQMbZem91x{K zBL77N4XsoQ?E~=7LKt9t=5S6Rkc@cpJr+4~Lqo`ek-<8v>m5#zq0G6A%{f416>WnRtVVhq&?Z#4KE|+<& zJ;f%-Nc<6;jilymOj7sp{1_n$$bdA`)o7ZRZEkoUs-mEJj!f~(v83>dup{xTFbl1#bu(7EREGm@yrHksV|lxcpW9QfRR|qm7 zX1>E5(86hDc`uxfyfc47C=C!G^V^bZCyKscvLPbb_Y#kFaSqdlwM6Xj*dZDm6Nd*X z7q2a(c8?Yvd7(y?cX1`CUlmOnD7y!T%;SOZ>v)ki^Xc4tg8$ zPBj!N=Hu49hS-{3KW!pzAd?#(vk5IntG>i`z93N88eq`eRYJwqLLC@%z&HKNfQJD~ zcIdB-tgdvF&J(aSFo$F~*T@#K$K^>=Vfo9GcfOAt=MXuWOP^{Mv?Vct;^d9mY-2ap zGQ@6w{mC4{lVHwrKfw}->7ds&x~jG>SlA5t6D7r->1c?c#ffn@E83N`zoK1b`+$y zcv$+^wJuc1tioWDHe{)9|AXO=7=E;h{A8V-M3?AYKs5u}$HDuMjnA$3zDN6WikF@P znto^xL_VZbXZt)AdYKA_x{qdP(ZboMWEYi@gt&!7D2Ur)1aTPTzXy0MZqfD0u_vb-6_%SxbL_h2xop=NiK>^Jw(#E=elbc=E7$}@1!zYLhX*h?{)C+R zTz+KRP=Q6VkuTz3`v;J5{_FCK!Aj{sMOiZebD3W8)bD;$T+62|B|Urk#1T1uo#|$j zpisQIa>>)3KrYeg1`hC7lLOQ2nSYqlY|+O{^r*dFD0^wUtzkJH1xPYce5i8l`<+K{ zQzP|4QBIxjy>xjZ6<7D-Yq1FvKf=w=b6a%TMCR*8UqNhUVP++e5a2}?s8egpcJ)TO z!Y@i_Z`B!nw+(|eg`i`=xxj-=jutv2$E=B16&%GEOOF3KOmB>@maUnjC9?L=ml}AZ z$v2%`Crw>p(7Gkq2sdCBV|z1X7FmYTXHsEM##BuEnWf3>b4>)=d}8Lz;S+4HY{-yg zC?$#_V@yfx>Dhl=7s>P|o?G_UXMwV8g8GDF{)QRxFYQezph`4GI}2X##(491B%LAyHz27LzY z57X@jvlNu@m)}&#$tu4fK`SsTCvZoCIjTI}KKYvHBF_JdFvhVr^5Z0=Ri^7!Kr$8U zv{cF6OkNs01fOYU*4TO`r@-}>LZ%C!e5q2G1;hA<#dK7G3;>uCIifArs zb5&?HmM@KxdY=^@y?(D|OgdC7ICfXO|Oh!1*(sF($uojeg7}3X-G948j7Zrp`)z58#jXz{>rsSlI;q z>t|wCR|Flhlcei!hIV7w7rYYfWd{5YNyd}0<|^NyhL|^*D(&VuFNvqOG95hPDw1#{ z#pRS9jzAK@*s`qhb|a=^ra7Tnt(0{d1dA0+IHjeyXs`=oh`af5G{w@ueFmKn*Kkk} zH=KgvW*pH~Q116n{y!rcn;!N}w%!$Ho1&MNU~V+Y)7#y9K@S&v&SpFUQS#=-KRx?+ zVU>e-cnde}dX(x&j!Fs8Rfh;lmo=Vg$Dd-ROVt)x#Lsp`olxzf7fmo#DWZEhH?NHO zky?)w$Rlbl1Au3w^aVZ;lsZp|{pt0g==UFeG&ojS3DrW9_$&9PYuM}aq%4HacgB>Dnn-Qa^# zXn5DjU&XGw<&@5FtIe2kw{-g2*{hp^{_g_;JYyBO_U93>tWI{+Rro> ztZs;fq`}d4U})39rO|vY%@i?Tqk^+&!6aKM!tn*q~_%mD@0h_DNy2Ck`rPiaJy@V)wzi^ z-P=OcEiTF5$hHQh>_yhd?ra4-j>JWV_svoW)|BPY^%jZsE_BF|`4EC04kIL;2aBNN z_UvKmQvl%#6#d3>u@}Z528%cftl_#1)wgvs7fv2?n$u$_YeyMjp4g~Kk>a|!RK?bv zMcX;d#Fw68qLyzJ^H?^Cd#u#RgX85*U{Etvduo<+mG49rM6Rms{+_WtJR_hUvD;rf zBiTzW%fE}z+$X?!4FK_T!P(|t&nZE%p#E+u&kirzC;$X7gKGVS_mLJ5+w=|kR{bY^ zn;xqjGivKc=MOQP6P4@ge6rvzwK&6<4Yy{HCKz___(f9pWl8|={LZdk^P#&!S*PAo^i0O+?F{mOW%$9oBU18t+d z%}pmIapJc9!;2_BV*NPbi=(L6+n42c6K8)n0y3sQs8d$N{Y<;tp?tlc$6AwcSqevy zuPc7u3jsluckc8uvEZDQ?FGW4%@|nZb+icQ_m-*Cd6wGCG#q@b5X&YM!-XZreLHjy z`fLt^jz;g-Jj2IVcIt+j{YHb{qu!G$3>Lku#Uk1y;BhRJ7sf^Xyi$5HdtS3%@pj4g zbayx#BA^i)UgCFaF{^|l&y}|{8k=+a@IpKMdV`E(=Th{aZGC*T%PRXs<{fS1&O1`0 zbW;7lB)-rEEC`>AL`8)fVV?Il{)`1t*RnC;t9^6{K3;B|0h1b+G|@OxjY87~8G7}j zH%wDNgJ>o9Y!BJ#Gev(sOf!Nh$nMS6-Y<84#&a8_v1I+|Ko=3k_Ja=skHpNxL+hyw zs!CKq?#%?|O&_qfrHzjRcT?c(SOX!6PNUxHU!b)d4qrXNJD(i6gUxf9VR0I~`McRc zCzKI(pi>7-CL?}xRWFOHvUySs#wXDFdHw==z=;a@#kerBYoDe?#7DOshUZRJvt_$+ zHZ0@5<-^WU=eJ&}XQI;^0{<|gpTfwyiwLXj7(MtSdcIT{W7A3P+JDImex8iXS7R6+KM*Y5(HN$xK&F~2 zkveULMj5e>k&;we@r0z&ut#g_GxSlA3u!=`J7u;aURBgB(q{2nnMMbT>htA%v<|~; zwpls4ndEphFl>!>=v|zrH`;tmZ(qN*eQl)r!h{q%r zH%6ISo%4J>W6CyRJ#2Y7YPl9$`vpgv`HiJ%+s^J!kgY}6!YWsidj)|zz_nc0nln|4 zZNftu*{R&JF`)Il+_XUzj;yPzcw2#O3~Im`H-dZ3bqebb07kq8giaIe+&A?x^!H%hK*e>-5b2k*>pTP2b6iFFni0im5%vCaX)mc63tO(8%*KP^MEjuOxc z&99C@#U5P{pr6OB(LuX;`&T4NFUM76G`|Q3^-JPuB4kwyqY&OZgiM(ZKdJM~MY9aO zAb<0Hzea|5>~j?OT~cn4XL7Kdre#DCI_W}$5_4395`^`h#1)9G)qv^AMkAY>qfY!g zHl}ZD1l_iX(7{QdGtF*J=sKc6lNg-5WqsYM3A97Sd}LUB)UZ?ue$A|uIm8BiJ^T?q zU~@tDqC;;v56EvKA#+(f%fI2QG={z&kA{$9LaQ_#hF_mqQ#p~w2W)_7(sQ^MQYV-G zdrg^k%^}yQca9)W*-6L)(C?LgzDV4|>SV~@w$Mh^BH@d8GUS-xLhgMXzUq_rNGklM z!+=d?xjI{lchxc4IDWC*usbV}^H8#tut%WsG;%4G!9f3cTmMt%>4&fGP7%7$8M+@A 
zhSzu51U0VHJxG^`kPt)pcRn2@_BKkf9z!0$l?mc01CFFDZ0H_3seJ=QHQ$-1?8vce zn3c37MSH%~QlR=2x*bQJVCWfwxoaP!sy7t zZ!)^8|4&{&=MHMGO&TMWX^C#Xqa>7Uo538jDY135A&)b-C8q$2CHfA{9kzL=6ymJF z7R8+d(@Q;&v7Z$Zq()Zw2FqozCROKup~()tE^xR_+S%W8?-h8xom-KJeE) zY~recJ3^F7o{B}UU&2|0t#XP6E$1FFCnBFjWjF+VxCU>My>qnonO>$+7}=)DQ?bnc zblyIpB)<%$omxez5Rm{OgHkkK$UfHS1CT$f=$B_NaXNt<-`~d;i7a{hJI4gBP9fYu zn;iob+wPE=M0(_&3BvUnWw{9kN)r8E&BYzodTQd`@ee}ec)#oVqD!g3y$5bKC(r{V zrG&5tW-{gYen~Gj*)BgzGKfn>5pa@kZ0pLqMRn0)&{4+*lhFKRp#mC6;F?jmO1|)> z<1nIO!S%=h8m5|<_g;A^Zx>T`XAifMJE#_;e8f3Th=5cW6W`o0fa&zq@nnY@Z6_X9 zzcLnu^e4H6)!Nxc-`!oh>Se2ME@$iT82?ppj^f~KXx8xpZ2(}{r*zpm3C zV>giKP@z7)#@&4ivkiJZ^BJDMshWhI3~9N ztI};GS-j8kw8zNd$hM?pay}Y?ML$RqGDqT6WH8n_FV;DkL)z#{EP%L=p^>59cw+eO zs-SwU-bXB%bL5a_Jnl&T?h?aj>XeO4l_e@+R|wrRU_w0-#qwr$s4~oJ>XN&Y*NYqL ze-EszugYuQ?wNjQ7$TZuq#+!48|C)>gyn8c)n!VZy!0a`{7B@V^}NE5KbAM=ej6=; zuOHtqg1%{gJj0UMu*?WSm*Kab=dTM_VQfP4m!+zP8*n9Kf5Cn1xc6}TMJ$FN#Wi`S z=@&>G-g$I#&6{2ri(Lf=IXqF6GIjf;giTGm_|Cqu=1_1>CDk^jW}04! zEfH0j$iU8F&yef;vt|jMd`bLZa6RBFy5OBmgcGVm4LXOv+GZN9dLG;kPAn0Iu)Hb0 zY$0?B(Z$P;;QU}|eKth0Kf!mSA+Rb zY?gEi6_T9Lvbu+|X`gwrSvnb=NtW|Hl{vO zgd8g)fv`PK>xcv?f&%4T0pBixup^gV+~*g34tl-- zc7pWw@)EE}Yk%A3#Wj#HT^B-~URNp6!%RirIrDp4rK>BUD5$fXCrFQtRL zA2E&4g*z#1i4$>Ix=N1H8Vc*A)EQ2~|BPY&(~e;uw8T|>%?4c^LiP|AZ@v1IJ$B4` zKrS@bgwQJ_T6a`s!l?#AeaA6%=Xz#DG`6iPH!j@GKqkRzGa-QNK>)n-*8ZypyD5I# zaY#LA6fKOGs)-V#W=8ygwyU1U-dp&F%NsO|!_qzn%znEQx?8uF1UHVBKul1{PY=w$ zI$qhG?xLKqyYy~ME#xEWweohW9Bdj9YS*`M>;gxG_B^?9UyJH&z8UP`HNg#qt&z%h zRj2p7S}SlWN~>c@XYpB+n!C~Gi$rQ%-6U4bG-bo)SWM8{yO73~!|YMk>H(OfJa>!O zf$$yScU}2t(YXrc)CD&Ni&)`qFyfBd^Mlt%-PuEvpzvP}H8-SWYlBg#~j3gOGrpfCQ-LIT^S$f+HJREKU_U-SwcYxZg+0X3{RkDC9ex& z`Ej8Aw5_?&54ORoN3MCf&UcegDZZPAB}i;Cv}1)ApLgX!-9ENzYwQ~9pZ3g__d(K< zyYrEenL^IgXHQPJNWTrX%cWQzD7##toP(I^ksBo}UU>Al z)$9RBa4exyoLIT~GVT5N^}`(DfX{7_v5NjL9Sjn+(XPCrrgHUBj@YCRjV02H zkFTqHL;tsA5kE;j4B*9@N3PQ#GwA&BvvPJ6fFV?xhL34*>Juho@za^cRAl3tHR)>G%x~=LfJamcH3E6 zYu)jq~dv~iaMBUAN*gWI_hA=~`!ymSPi zoed?l>eSz^7LqU8iR2TM;a({skBWONq-^EDjA#O?1IxJv%w5RWIZl`VO4piHm7b|r z!%=RurWvfepH|Jk+bxMLEbHoN%@dE}Tw&FoC!6flQGRrYVX?%NlG@}8iPJ~oc;Ion zKChU9HdgX5N;QMoK-avl-cH?I>o)3^PuIS(&8Ryr*mlKRo7^u_NnD~Wi%WE@*xI*- z*60#nV4pV3=LeD1S~3>;aJMhwZ@}) zsc;aWR>_T@vA{+NPk! zlin&g>VN9nMSVeNn#Qz9z8F1JV-}SUJ{}csw>HS_%YFZ0t`FZH7BQ?YvTb|q8z7~u zs{GOp>mZ14YjCBQdp0SFrfiv^AWN6M`?RXB82^bVvAw7V-TzY$+QI!_b<_Wa^;?+Q zIQ$3F|35HFODTPJ8w_w^H=Z!WU6i5*d7^!NjD$^7@i) zO5Cz7xGlp=vh(s$pE6QX?a^lEv2i? 
[remaining base85-encoded GIT binary patch data omitted: it encodes a binary zip archive and is not meaningful as text]
diff --git a/python/setup.py b/python/setup.py index 2644d3e79dea1..cfc83c68e3df5 100644 --- a/python/setup.py +++ b/python/setup.py @@ -194,7 +194,7 @@ def _supports_symlinks(): 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.10.4'], + install_requires=['py4j==0.10.6'], setup_requires=['pypandoc'], extras_require={ 'ml': ['numpy>=1.7'], diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index e5131e636dc04..1dd0715918042 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1124,7 +1124,7 @@ private[spark] class Client( val pyArchivesFile = new File(pyLibPath, "pyspark.zip") require(pyArchivesFile.exists(), s"$pyArchivesFile not found; cannot run pyspark application in YARN mode.") - val py4jFile = new File(pyLibPath, "py4j-0.10.4-src.zip") + val py4jFile = new File(pyLibPath, "py4j-0.10.6-src.zip") require(py4jFile.exists(), s"$py4jFile not found; cannot run pyspark
application in YARN mode.") Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath()) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 59adb7e22d185..fc78bc488b116 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -249,7 +249,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { // needed locations. val sparkHome = sys.props("spark.test.home") val pythonPath = Seq( - s"$sparkHome/python/lib/py4j-0.10.4-src.zip", + s"$sparkHome/python/lib/py4j-0.10.6-src.zip", s"$sparkHome/python") val extraEnvVars = Map( "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator), diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh index f2d9e6b568a9b..bac154e10ae62 100755 --- a/sbin/spark-config.sh +++ b/sbin/spark-config.sh @@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" # Add the PySpark classes to the PYTHONPATH: if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:${PYTHONPATH}" + export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.6-src.zip:${PYTHONPATH}" export PYSPARK_PYTHONPATH_SET=1 fi From ab866f117378e64dba483ead51b769ae7be31d4d Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 5 Jul 2017 18:26:28 -0700 Subject: [PATCH 100/779] [SPARK-21248][SS] The clean up codes in StreamExecution should not be interrupted ## What changes were proposed in this pull request? This PR uses `runUninterruptibly` to avoid that the clean up codes in StreamExecution is interrupted. It also removes an optimization in `runUninterruptibly` to make sure this method never throw `InterruptedException`. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #18461 from zsxwing/SPARK-21248. --- .../org/apache/spark/util/UninterruptibleThread.scala | 10 +--------- .../apache/spark/util/UninterruptibleThreadSuite.scala | 5 ++--- .../sql/execution/streaming/StreamExecution.scala | 6 +++++- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala b/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala index 27922b31949b6..6a58ec142dd7f 100644 --- a/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala +++ b/core/src/main/scala/org/apache/spark/util/UninterruptibleThread.scala @@ -55,9 +55,6 @@ private[spark] class UninterruptibleThread( * Run `f` uninterruptibly in `this` thread. The thread won't be interrupted before returning * from `f`. * - * If this method finds that `interrupt` is called before calling `f` and it's not inside another - * `runUninterruptibly`, it will throw `InterruptedException`. - * * Note: this method should be called only in `this` thread. */ def runUninterruptibly[T](f: => T): T = { @@ -73,12 +70,7 @@ private[spark] class UninterruptibleThread( uninterruptibleLock.synchronized { // Clear the interrupted status if it's set. - if (Thread.interrupted() || shouldInterruptThread) { - shouldInterruptThread = false - // Since it's interrupted, we don't need to run `f` which may be a long computation. - // Throw InterruptedException as we don't have a T to return. 
- throw new InterruptedException() - } + shouldInterruptThread = Thread.interrupted() || shouldInterruptThread uninterruptible = true } try { diff --git a/core/src/test/scala/org/apache/spark/util/UninterruptibleThreadSuite.scala b/core/src/test/scala/org/apache/spark/util/UninterruptibleThreadSuite.scala index 39b31f8ddeaba..6a190f63ac9d0 100644 --- a/core/src/test/scala/org/apache/spark/util/UninterruptibleThreadSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UninterruptibleThreadSuite.scala @@ -68,7 +68,6 @@ class UninterruptibleThreadSuite extends SparkFunSuite { Uninterruptibles.awaitUninterruptibly(interruptLatch, 10, TimeUnit.SECONDS) try { runUninterruptibly { - assert(false, "Should not reach here") } } catch { case _: InterruptedException => hasInterruptedException = true @@ -80,8 +79,8 @@ class UninterruptibleThreadSuite extends SparkFunSuite { t.interrupt() interruptLatch.countDown() t.join() - assert(hasInterruptedException === true) - assert(interruptStatusBeforeExit === false) + assert(hasInterruptedException === false) + assert(interruptStatusBeforeExit === true) } test("nested runUninterruptibly") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index d5f8d2acba92b..10c42a7338e85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -357,7 +357,11 @@ class StreamExecution( if (!NonFatal(e)) { throw e } - } finally { + } finally microBatchThread.runUninterruptibly { + // The whole `finally` block must run inside `runUninterruptibly` to avoid being interrupted + // when a query is stopped by the user. We need to make sure the following codes finish + // otherwise it may throw `InterruptedException` to `UncaughtExceptionHandler` (SPARK-21248). + // Release latches to unblock the user codes since exception can happen in any place and we // may not get a chance to release them startLatch.countDown() From 75b168fd30bb9a52ae223b6f1df73da4b1316f2e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 6 Jul 2017 14:18:50 +0800 Subject: [PATCH 101/779] [SPARK-21308][SQL] Remove SQLConf parameters from the optimizer ### What changes were proposed in this pull request? This PR removes SQLConf parameters from the optimizer rules ### How was this patch tested? The existing test cases Author: gatorsmile Closes #18533 from gatorsmile/rmSQLConfOptimizer. 
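To make the mechanics of this refactoring concrete, here is a minimal, hypothetical sketch of the before/after shape of a single rule (the rule names and bodies are invented for illustration; only `Rule`, `LogicalPlan`, `SQLConf.get` and `cboEnabled` are taken from the patch below):

```scala
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf

// Before: the conf is a constructor parameter, captured when the rule is built,
// so every caller has to thread a SQLConf instance through to the rule.
case class ExampleThresholdRuleOld(conf: SQLConf) extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan =
    if (!conf.cboEnabled) plan else plan // a real rule would transform `plan` here
}

// After: the rule is a parameterless object that reads the active session's
// conf lazily, at rule-execution time, via SQLConf.get.
object ExampleThresholdRuleNew extends Rule[LogicalPlan] {
  private def conf = SQLConf.get

  def apply(plan: LogicalPlan): LogicalPlan =
    if (!conf.cboEnabled) plan else plan // a real rule would transform `plan` here
}
```

Because rules now read the active conf at execution time, the test suites in the diff below no longer construct rules around a private `SQLConf` copy; instead they set the relevant keys on the shared conf in `beforeAll` and restore them in `afterAll`.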
--- .../optimizer/CostBasedJoinReorder.scala | 7 ++-- .../sql/catalyst/optimizer/Optimizer.scala | 36 +++++++++---------- .../optimizer/StarSchemaDetection.scala | 4 ++- .../sql/catalyst/optimizer/expressions.scala | 14 ++++---- .../spark/sql/catalyst/optimizer/joins.scala | 6 ++-- .../BinaryComparisonSimplificationSuite.scala | 2 +- .../BooleanSimplificationSuite.scala | 2 +- .../optimizer/CombiningLimitsSuite.scala | 2 +- .../optimizer/ConstantFoldingSuite.scala | 2 +- .../optimizer/DecimalAggregatesSuite.scala | 2 +- .../optimizer/EliminateMapObjectsSuite.scala | 2 +- .../optimizer/JoinOptimizationSuite.scala | 2 +- .../catalyst/optimizer/JoinReorderSuite.scala | 27 +++++++++++--- .../optimizer/LimitPushdownSuite.scala | 2 +- .../optimizer/OptimizeCodegenSuite.scala | 2 +- .../catalyst/optimizer/OptimizeInSuite.scala | 24 +++++++------ .../StarJoinCostBasedReorderSuite.scala | 36 ++++++++++++++----- .../optimizer/StarJoinReorderSuite.scala | 25 ++++++++++--- .../optimizer/complexTypesSuite.scala | 2 +- .../spark/sql/catalyst/plans/PlanTest.scala | 4 +-- .../execution/OptimizeMetadataOnlyQuery.scala | 8 ++--- .../spark/sql/execution/SparkOptimizer.scala | 6 ++-- .../internal/BaseSessionStateBuilder.scala | 2 +- 23 files changed, 137 insertions(+), 82 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala index 3a7543e2141e9..db7baf6e9bc7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CostBasedJoinReorder.scala @@ -32,7 +32,10 @@ import org.apache.spark.sql.internal.SQLConf * We may have several join reorder algorithms in the future. This class is the entry of these * algorithms, and chooses which one to use. */ -case class CostBasedJoinReorder(conf: SQLConf) extends Rule[LogicalPlan] with PredicateHelper { +object CostBasedJoinReorder extends Rule[LogicalPlan] with PredicateHelper { + + private def conf = SQLConf.get + def apply(plan: LogicalPlan): LogicalPlan = { if (!conf.cboEnabled || !conf.joinReorderEnabled) { plan @@ -379,7 +382,7 @@ object JoinReorderDPFilters extends PredicateHelper { if (conf.joinReorderDPStarFilter) { // Compute the tables in a star-schema relationship. - val starJoin = StarSchemaDetection(conf).findStarJoins(items, conditions.toSeq) + val starJoin = StarSchemaDetection.findStarJoins(items, conditions.toSeq) val nonStarJoin = items.filterNot(starJoin.contains(_)) if (starJoin.nonEmpty && nonStarJoin.nonEmpty) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 946fa7bae0199..d82af94dbffb7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -34,10 +34,10 @@ import org.apache.spark.sql.types._ * Abstract class all optimizers should inherit of, contains the standard batches (extending * Optimizers can override this. 
*/ -abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) +abstract class Optimizer(sessionCatalog: SessionCatalog) extends RuleExecutor[LogicalPlan] { - protected val fixedPoint = FixedPoint(conf.optimizerMaxIterations) + protected def fixedPoint = FixedPoint(SQLConf.get.optimizerMaxIterations) def batches: Seq[Batch] = { Batch("Eliminate Distinct", Once, EliminateDistinct) :: @@ -77,11 +77,11 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) Batch("Operator Optimizations", fixedPoint, Seq( // Operator push down PushProjectionThroughUnion, - ReorderJoin(conf), + ReorderJoin, EliminateOuterJoin, PushPredicateThroughJoin, PushDownPredicate, - LimitPushDown(conf), + LimitPushDown, ColumnPruning, InferFiltersFromConstraints, // Operator combine @@ -92,10 +92,10 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) CombineLimits, CombineUnions, // Constant folding and strength reduction - NullPropagation(conf), + NullPropagation, ConstantPropagation, FoldablePropagation, - OptimizeIn(conf), + OptimizeIn, ConstantFolding, ReorderAssociativeOperator, LikeSimplification, @@ -117,11 +117,11 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) CombineConcats) ++ extendedOperatorOptimizationRules: _*) :: Batch("Check Cartesian Products", Once, - CheckCartesianProducts(conf)) :: + CheckCartesianProducts) :: Batch("Join Reorder", Once, - CostBasedJoinReorder(conf)) :: + CostBasedJoinReorder) :: Batch("Decimal Optimizations", fixedPoint, - DecimalAggregates(conf)) :: + DecimalAggregates) :: Batch("Object Expressions Optimization", fixedPoint, EliminateMapObjects, CombineTypedFilters) :: @@ -129,7 +129,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf) ConvertToLocalRelation, PropagateEmptyRelation) :: Batch("OptimizeCodegen", Once, - OptimizeCodegen(conf)) :: + OptimizeCodegen) :: Batch("RewriteSubquery", Once, RewritePredicateSubquery, CollapseProject) :: Nil @@ -178,8 +178,7 @@ class SimpleTestOptimizer extends Optimizer( new SessionCatalog( new InMemoryCatalog, EmptyFunctionRegistry, - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)), - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) + new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true))) /** * Remove redundant aliases from a query plan. A redundant alias is an alias that does not change @@ -288,7 +287,7 @@ object RemoveRedundantProject extends Rule[LogicalPlan] { /** * Pushes down [[LocalLimit]] beneath UNION ALL and beneath the streamed inputs of outer joins. */ -case class LimitPushDown(conf: SQLConf) extends Rule[LogicalPlan] { +object LimitPushDown extends Rule[LogicalPlan] { private def stripGlobalLimitIfPresent(plan: LogicalPlan): LogicalPlan = { plan match { @@ -1077,8 +1076,7 @@ object CombineLimits extends Rule[LogicalPlan] { * the join between R and S is not a cartesian product and therefore should be allowed. * The predicate R.r = S.s is not recognized as a join condition until the ReorderJoin rule. */ -case class CheckCartesianProducts(conf: SQLConf) - extends Rule[LogicalPlan] with PredicateHelper { +object CheckCartesianProducts extends Rule[LogicalPlan] with PredicateHelper { /** * Check if a join is a cartesian product. Returns true if * there are no join conditions involving references from both left and right. 
@@ -1090,7 +1088,7 @@ case class CheckCartesianProducts(conf: SQLConf) } def apply(plan: LogicalPlan): LogicalPlan = - if (conf.crossJoinEnabled) { + if (SQLConf.get.crossJoinEnabled) { plan } else plan transform { case j @ Join(left, right, Inner | LeftOuter | RightOuter | FullOuter, condition) @@ -1112,7 +1110,7 @@ case class CheckCartesianProducts(conf: SQLConf) * This uses the same rules for increasing the precision and scale of the output as * [[org.apache.spark.sql.catalyst.analysis.DecimalPrecision]]. */ -case class DecimalAggregates(conf: SQLConf) extends Rule[LogicalPlan] { +object DecimalAggregates extends Rule[LogicalPlan] { import Decimal.MAX_LONG_DIGITS /** Maximum number of decimal digits representable precisely in a Double */ @@ -1130,7 +1128,7 @@ case class DecimalAggregates(conf: SQLConf) extends Rule[LogicalPlan] { we.copy(windowFunction = ae.copy(aggregateFunction = Average(UnscaledValue(e)))) Cast( Divide(newAggExpr, Literal.create(math.pow(10.0, scale), DoubleType)), - DecimalType(prec + 4, scale + 4), Option(conf.sessionLocalTimeZone)) + DecimalType(prec + 4, scale + 4), Option(SQLConf.get.sessionLocalTimeZone)) case _ => we } @@ -1142,7 +1140,7 @@ case class DecimalAggregates(conf: SQLConf) extends Rule[LogicalPlan] { val newAggExpr = ae.copy(aggregateFunction = Average(UnscaledValue(e))) Cast( Divide(newAggExpr, Literal.create(math.pow(10.0, scale), DoubleType)), - DecimalType(prec + 4, scale + 4), Option(conf.sessionLocalTimeZone)) + DecimalType(prec + 4, scale + 4), Option(SQLConf.get.sessionLocalTimeZone)) case _ => ae } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala index ca729127e7d1d..1f20b7661489e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala @@ -28,7 +28,9 @@ import org.apache.spark.sql.internal.SQLConf /** * Encapsulates star-schema detection logic. */ -case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper { +object StarSchemaDetection extends PredicateHelper { + + private def conf = SQLConf.get /** * Star schema consists of one or more fact tables referencing a number of dimension diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 66b8ca62e5e4c..6c83f4790004f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -173,12 +173,12 @@ object ReorderAssociativeOperator extends Rule[LogicalPlan] { * 2. Replaces [[In (value, seq[Literal])]] with optimized version * [[InSet (value, HashSet[Literal])]] which is much faster. 
*/ -case class OptimizeIn(conf: SQLConf) extends Rule[LogicalPlan] { +object OptimizeIn extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsDown { case expr @ In(v, list) if expr.inSetConvertible => val newList = ExpressionSet(list).toSeq - if (newList.size > conf.optimizerInSetConversionThreshold) { + if (newList.size > SQLConf.get.optimizerInSetConversionThreshold) { val hSet = newList.map(e => e.eval(EmptyRow)) InSet(v, HashSet() ++ hSet) } else if (newList.size < list.size) { @@ -414,7 +414,7 @@ object LikeSimplification extends Rule[LogicalPlan] { * equivalent [[Literal]] values. This rule is more specific with * Null value propagation from bottom to top of the expression tree. */ -case class NullPropagation(conf: SQLConf) extends Rule[LogicalPlan] { +object NullPropagation extends Rule[LogicalPlan] { private def isNullLiteral(e: Expression): Boolean = e match { case Literal(null, _) => true case _ => false @@ -423,9 +423,9 @@ case class NullPropagation(conf: SQLConf) extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsUp { case e @ WindowExpression(Cast(Literal(0L, _), _, _), _) => - Cast(Literal(0L), e.dataType, Option(conf.sessionLocalTimeZone)) + Cast(Literal(0L), e.dataType, Option(SQLConf.get.sessionLocalTimeZone)) case e @ AggregateExpression(Count(exprs), _, _, _) if exprs.forall(isNullLiteral) => - Cast(Literal(0L), e.dataType, Option(conf.sessionLocalTimeZone)) + Cast(Literal(0L), e.dataType, Option(SQLConf.get.sessionLocalTimeZone)) case ae @ AggregateExpression(Count(exprs), _, false, _) if !exprs.exists(_.nullable) => // This rule should be only triggered when isDistinct field is false. ae.copy(aggregateFunction = Count(Literal(1))) @@ -552,14 +552,14 @@ object FoldablePropagation extends Rule[LogicalPlan] { /** * Optimizes expressions by replacing according to CodeGen configuration. */ -case class OptimizeCodegen(conf: SQLConf) extends Rule[LogicalPlan] { +object OptimizeCodegen extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case e: CaseWhen if canCodegen(e) => e.toCodegen() } private def canCodegen(e: CaseWhen): Boolean = { val numBranches = e.branches.size + e.elseValue.size - numBranches <= conf.maxCaseBranchesForCodegen + numBranches <= SQLConf.get.maxCaseBranchesForCodegen } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index bb97e2c808b9f..edbeaf273fd6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.internal.SQLConf * * If star schema detection is enabled, reorder the star join plans based on heuristics. */ -case class ReorderJoin(conf: SQLConf) extends Rule[LogicalPlan] with PredicateHelper { +object ReorderJoin extends Rule[LogicalPlan] with PredicateHelper { /** * Join a list of plans together and push down the conditions into them. 
* @@ -87,8 +87,8 @@ case class ReorderJoin(conf: SQLConf) extends Rule[LogicalPlan] with PredicateHe def apply(plan: LogicalPlan): LogicalPlan = plan transform { case ExtractFiltersAndInnerJoins(input, conditions) if input.size > 2 && conditions.nonEmpty => - if (conf.starSchemaDetection && !conf.cboEnabled) { - val starJoinPlan = StarSchemaDetection(conf).reorderStarJoins(input, conditions) + if (SQLConf.get.starSchemaDetection && !SQLConf.get.cboEnabled) { + val starJoinPlan = StarSchemaDetection.reorderStarJoins(input, conditions) if (starJoinPlan.nonEmpty) { val rest = input.filterNot(starJoinPlan.contains(_)) createOrderedJoin(starJoinPlan ++ rest, conditions) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala index 2a04bd588dc1d..a313681eeb8f0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala @@ -33,7 +33,7 @@ class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper Batch("AnalysisNodes", Once, EliminateSubqueryAliases) :: Batch("Constant Folding", FixedPoint(50), - NullPropagation(conf), + NullPropagation, ConstantFolding, BooleanSimplification, SimplifyBinaryComparison, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index c6345b60b744b..56399f4831a6f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -35,7 +35,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { Batch("AnalysisNodes", Once, EliminateSubqueryAliases) :: Batch("Constant Folding", FixedPoint(50), - NullPropagation(conf), + NullPropagation, ConstantFolding, BooleanSimplification, PruneFilters) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala index ac71887c16f96..87ad81db11b64 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala @@ -32,7 +32,7 @@ class CombiningLimitsSuite extends PlanTest { Batch("Combine Limit", FixedPoint(10), CombineLimits) :: Batch("Constant Folding", FixedPoint(10), - NullPropagation(conf), + NullPropagation, ConstantFolding, BooleanSimplification, SimplifyConditionals) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 25c592b9c1dde..641c89873dcc4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -33,7 +33,7 @@ class ConstantFoldingSuite extends PlanTest { Batch("AnalysisNodes", Once, EliminateSubqueryAliases) 
:: Batch("ConstantFolding", Once, - OptimizeIn(conf), + OptimizeIn, ConstantFolding, BooleanSimplification) :: Nil } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/DecimalAggregatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/DecimalAggregatesSuite.scala index cc4fb3a244a98..711294ed61928 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/DecimalAggregatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/DecimalAggregatesSuite.scala @@ -29,7 +29,7 @@ class DecimalAggregatesSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Decimal Optimizations", FixedPoint(100), - DecimalAggregates(conf)) :: Nil + DecimalAggregates) :: Nil } val testRelation = LocalRelation('a.decimal(2, 1), 'b.decimal(12, 1)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateMapObjectsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateMapObjectsSuite.scala index d4f37e2a5e877..157472c2293f9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateMapObjectsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateMapObjectsSuite.scala @@ -31,7 +31,7 @@ class EliminateMapObjectsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = { Batch("EliminateMapObjects", FixedPoint(50), - NullPropagation(conf), + NullPropagation, SimplifyCasts, EliminateMapObjects) :: Nil } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala index a6584aa5fbba7..2f30a78f03211 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinOptimizationSuite.scala @@ -37,7 +37,7 @@ class JoinOptimizationSuite extends PlanTest { CombineFilters, PushDownPredicate, BooleanSimplification, - ReorderJoin(conf), + ReorderJoin, PushPredicateThroughJoin, ColumnPruning, CollapseProject) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinReorderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinReorderSuite.scala index 71db4e2e0ec4d..2fb587d50a4cb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinReorderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/JoinReorderSuite.scala @@ -24,25 +24,42 @@ import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.statsEstimation.{StatsEstimationTestBase, StatsTestPlan} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.{CBO_ENABLED, JOIN_REORDER_ENABLED} class JoinReorderSuite extends PlanTest with StatsEstimationTestBase { - override val conf = new SQLConf().copy(CBO_ENABLED -> true, JOIN_REORDER_ENABLED -> true) - object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Operator Optimizations", FixedPoint(100), CombineFilters, PushDownPredicate, - ReorderJoin(conf), + ReorderJoin, PushPredicateThroughJoin, 
ColumnPruning, CollapseProject) :: Batch("Join Reorder", Once, - CostBasedJoinReorder(conf)) :: Nil + CostBasedJoinReorder) :: Nil + } + + var originalConfCBOEnabled = false + var originalConfJoinReorderEnabled = false + + override def beforeAll(): Unit = { + super.beforeAll() + originalConfCBOEnabled = conf.cboEnabled + originalConfJoinReorderEnabled = conf.joinReorderEnabled + conf.setConf(CBO_ENABLED, true) + conf.setConf(JOIN_REORDER_ENABLED, true) + } + + override def afterAll(): Unit = { + try { + conf.setConf(CBO_ENABLED, originalConfCBOEnabled) + conf.setConf(JOIN_REORDER_ENABLED, originalConfJoinReorderEnabled) + } finally { + super.afterAll() + } } /** Set up tables and columns for testing */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala index d8302dfc9462d..f50e2e86516f0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala @@ -32,7 +32,7 @@ class LimitPushdownSuite extends PlanTest { Batch("Subqueries", Once, EliminateSubqueryAliases) :: Batch("Limit pushdown", FixedPoint(100), - LimitPushDown(conf), + LimitPushDown, CombineLimits, ConstantFolding, BooleanSimplification) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeCodegenSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeCodegenSuite.scala index 9dc6738ba04b3..b71067c0af3a1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeCodegenSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeCodegenSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.rules._ class OptimizeCodegenSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { - val batches = Batch("OptimizeCodegen", Once, OptimizeCodegen(conf)) :: Nil + val batches = Batch("OptimizeCodegen", Once, OptimizeCodegen) :: Nil } protected def assertEquivalent(e1: Expression, e2: Expression): Unit = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index d8937321ecb98..6a77580b29a21 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -34,10 +34,10 @@ class OptimizeInSuite extends PlanTest { Batch("AnalysisNodes", Once, EliminateSubqueryAliases) :: Batch("ConstantFolding", FixedPoint(10), - NullPropagation(conf), + NullPropagation, ConstantFolding, BooleanSimplification, - OptimizeIn(conf)) :: Nil + OptimizeIn) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) @@ -159,16 +159,20 @@ class OptimizeInSuite extends PlanTest { .where(In(UnresolvedAttribute("a"), Seq(Literal(1), Literal(2), Literal(3)))) .analyze - val notOptimizedPlan = OptimizeIn(conf)(plan) - comparePlans(notOptimizedPlan, plan) + withSQLConf(OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> "10") { + val notOptimizedPlan = OptimizeIn(plan) + comparePlans(notOptimizedPlan, plan) + } // Reduce the threshold to turning into InSet. 
- val optimizedPlan = OptimizeIn(conf.copy(OPTIMIZER_INSET_CONVERSION_THRESHOLD -> 2))(plan) - optimizedPlan match { - case Filter(cond, _) - if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].getHSet().size == 3 => - // pass - case _ => fail("Unexpected result for OptimizedIn") + withSQLConf(OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> "2") { + val optimizedPlan = OptimizeIn(plan) + optimizedPlan match { + case Filter(cond, _) + if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].getHSet().size == 3 => + // pass + case _ => fail("Unexpected result for OptimizedIn") + } } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinCostBasedReorderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinCostBasedReorderSuite.scala index a23d6266b2840..ada6e2a43ea0f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinCostBasedReorderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinCostBasedReorderSuite.scala @@ -24,28 +24,46 @@ import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.statsEstimation.{StatsEstimationTestBase, StatsTestPlan} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf._ class StarJoinCostBasedReorderSuite extends PlanTest with StatsEstimationTestBase { - override val conf = new SQLConf().copy( - CBO_ENABLED -> true, - JOIN_REORDER_ENABLED -> true, - JOIN_REORDER_DP_STAR_FILTER -> true) - object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Operator Optimizations", FixedPoint(100), CombineFilters, PushDownPredicate, - ReorderJoin(conf), + ReorderJoin, PushPredicateThroughJoin, ColumnPruning, CollapseProject) :: - Batch("Join Reorder", Once, - CostBasedJoinReorder(conf)) :: Nil + Batch("Join Reorder", Once, + CostBasedJoinReorder) :: Nil + } + + var originalConfCBOEnabled = false + var originalConfJoinReorderEnabled = false + var originalConfJoinReorderDPStarFilter = false + + override def beforeAll(): Unit = { + super.beforeAll() + originalConfCBOEnabled = conf.cboEnabled + originalConfJoinReorderEnabled = conf.joinReorderEnabled + originalConfJoinReorderDPStarFilter = conf.joinReorderDPStarFilter + conf.setConf(CBO_ENABLED, true) + conf.setConf(JOIN_REORDER_ENABLED, true) + conf.setConf(JOIN_REORDER_DP_STAR_FILTER, true) + } + + override def afterAll(): Unit = { + try { + conf.setConf(CBO_ENABLED, originalConfCBOEnabled) + conf.setConf(JOIN_REORDER_ENABLED, originalConfJoinReorderEnabled) + conf.setConf(JOIN_REORDER_DP_STAR_FILTER, originalConfJoinReorderDPStarFilter) + } finally { + super.afterAll() + } } private val columnInfo: AttributeMap[ColumnStat] = AttributeMap(Seq( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala index 605c01b7220d1..777c5637201ed 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala @@ -24,19 +24,36 @@ import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LocalRelation, LogicalPlan} 
import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.statsEstimation.{StatsEstimationTestBase, StatsTestPlan} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.{CASE_SENSITIVE, STARSCHEMA_DETECTION} +import org.apache.spark.sql.internal.SQLConf._ class StarJoinReorderSuite extends PlanTest with StatsEstimationTestBase { - override val conf = new SQLConf().copy(CASE_SENSITIVE -> true, STARSCHEMA_DETECTION -> true) + var originalConfStarSchemaDetection = false + var originalConfCBOEnabled = true + + override def beforeAll(): Unit = { + super.beforeAll() + originalConfStarSchemaDetection = conf.starSchemaDetection + originalConfCBOEnabled = conf.cboEnabled + conf.setConf(STARSCHEMA_DETECTION, true) + conf.setConf(CBO_ENABLED, false) + } + + override def afterAll(): Unit = { + try { + conf.setConf(STARSCHEMA_DETECTION, originalConfStarSchemaDetection) + conf.setConf(CBO_ENABLED, originalConfCBOEnabled) + } finally { + super.afterAll() + } + } object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Operator Optimizations", FixedPoint(100), CombineFilters, PushDownPredicate, - ReorderJoin(conf), + ReorderJoin, PushPredicateThroughJoin, ColumnPruning, CollapseProject) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index 0a18858350e1f..3634accf1ec21 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -37,7 +37,7 @@ class ComplexTypesSuite extends PlanTest{ Batch("collapse projections", FixedPoint(10), CollapseProject) :: Batch("Constant Folding", FixedPoint(10), - NullPropagation(conf), + NullPropagation, ConstantFolding, BooleanSimplification, SimplifyConditionals, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index e9679d3361509..5389bf3389da4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -31,8 +31,8 @@ import org.apache.spark.sql.internal.SQLConf */ trait PlanTest extends SparkFunSuite with PredicateHelper { - // TODO(gatorsmile): remove this from PlanTest and all the analyzer/optimizer rules - protected val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true) + // TODO(gatorsmile): remove this from PlanTest and all the analyzer rules + protected def conf = SQLConf.get /** * Since attribute references are given globally unique ids during analysis, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 3c046ce494285..5cfad9126986b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -38,12 +38,10 @@ import org.apache.spark.sql.internal.SQLConf * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. * e.g. SELECT col1, Max(col2) FROM tbl GROUP BY col1. 
*/ -case class OptimizeMetadataOnlyQuery( - catalog: SessionCatalog, - conf: SQLConf) extends Rule[LogicalPlan] { +case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - if (!conf.optimizerMetadataOnly) { + if (!SQLConf.get.optimizerMetadataOnly) { return plan } @@ -106,7 +104,7 @@ case class OptimizeMetadataOnlyQuery( val caseInsensitiveProperties = CaseInsensitiveMap(relation.tableMeta.storage.properties) val timeZoneId = caseInsensitiveProperties.get(DateTimeUtils.TIMEZONE_OPTION) - .getOrElse(conf.sessionLocalTimeZone) + .getOrElse(SQLConf.get.sessionLocalTimeZone) val partitionData = catalog.listPartitions(relation.tableMeta.identifier).map { p => InternalRow.fromSeq(partAttrs.map { attr => Cast(Literal(p.spec(attr.name)), attr.dataType, Option(timeZoneId)).eval() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 1de4f508b89a0..00ff4c8ac310b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -22,16 +22,14 @@ import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate -import org.apache.spark.sql.internal.SQLConf class SparkOptimizer( catalog: SessionCatalog, - conf: SQLConf, experimentalMethods: ExperimentalMethods) - extends Optimizer(catalog, conf) { + extends Optimizer(catalog) { override def batches: Seq[Batch] = (preOptimizationBatches ++ super.batches :+ - Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog, conf)) :+ + Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions)) ++ postHocOptimizationBatches :+ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 9d0148117fadf..72d0ddc62303a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -208,7 +208,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on the `conf`, `catalog` and `experimentalMethods` fields. */ protected def optimizer: Optimizer = { - new SparkOptimizer(catalog, conf, experimentalMethods) { + new SparkOptimizer(catalog, experimentalMethods) { override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules } From 14a3bb3a008c302aac908d7deaf0942a98c63be7 Mon Sep 17 00:00:00 2001 From: Sumedh Wale Date: Thu, 6 Jul 2017 14:47:22 +0800 Subject: [PATCH 102/779] [SPARK-21312][SQL] correct offsetInBytes in UnsafeRow.writeToStream ## What changes were proposed in this pull request? Corrects offsetInBytes calculation in UnsafeRow.writeToStream. Known failures include writes to some DataSources that have own SparkPlan implementations and cause EXCHANGE in writes. ## How was this patch tested? 
Extended UnsafeRowSuite.writeToStream to include an UnsafeRow over byte array having non-zero offset. Author: Sumedh Wale Closes #18535 from sumwale/SPARK-21312. --- .../spark/sql/catalyst/expressions/UnsafeRow.java | 2 +- .../scala/org/apache/spark/sql/UnsafeRowSuite.scala | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index 86de90984ca00..56994fafe064b 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -550,7 +550,7 @@ public void copyFrom(UnsafeRow row) { */ public void writeToStream(OutputStream out, byte[] writeBuffer) throws IOException { if (baseObject instanceof byte[]) { - int offsetInByteArray = (int) (Platform.BYTE_ARRAY_OFFSET - baseOffset); + int offsetInByteArray = (int) (baseOffset - Platform.BYTE_ARRAY_OFFSET); out.write((byte[]) baseObject, offsetInByteArray, sizeInBytes); } else { int dataRemaining = sizeInBytes; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index a32763db054f3..a5f904c621e6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -101,9 +101,22 @@ class UnsafeRowSuite extends SparkFunSuite { MemoryAllocator.UNSAFE.free(offheapRowPage) } } + val (bytesFromArrayBackedRowWithOffset, field0StringFromArrayBackedRowWithOffset) = { + val baos = new ByteArrayOutputStream() + val numBytes = arrayBackedUnsafeRow.getSizeInBytes + val bytesWithOffset = new Array[Byte](numBytes + 100) + System.arraycopy(arrayBackedUnsafeRow.getBaseObject.asInstanceOf[Array[Byte]], 0, + bytesWithOffset, 100, numBytes) + val arrayBackedRow = new UnsafeRow(arrayBackedUnsafeRow.numFields()) + arrayBackedRow.pointTo(bytesWithOffset, Platform.BYTE_ARRAY_OFFSET + 100, numBytes) + arrayBackedRow.writeToStream(baos, null) + (baos.toByteArray, arrayBackedRow.getString(0)) + } assert(bytesFromArrayBackedRow === bytesFromOffheapRow) assert(field0StringFromArrayBackedRow === field0StringFromOffheapRow) + assert(bytesFromArrayBackedRow === bytesFromArrayBackedRowWithOffset) + assert(field0StringFromArrayBackedRow === field0StringFromArrayBackedRowWithOffset) } test("calling getDouble() and getFloat() on null columns") { From 60043f22458668ac7ecba94fa78953f23a6bdcec Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 6 Jul 2017 00:20:26 -0700 Subject: [PATCH 103/779] [SS][MINOR] Fix flaky test in DatastreamReaderWriterSuite. temp checkpoint dir should be deleted ## What changes were proposed in this pull request? Stopping query while it is being initialized can throw interrupt exception, in which case temporary checkpoint directories will not be deleted, and the test will fail. Author: Tathagata Das Closes #18442 from tdas/DatastreamReaderWriterSuite-fix. 
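As a rough sketch of the pattern behind the one-line fix below (assuming a local `SparkSession` named `spark`; this is not the actual test code), `processAllAvailable()` blocks until the query has finished initializing and drained any pending data, so a later `stop()` can no longer interrupt initialization and leave the temporary checkpoint directory behind:

```scala
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
import spark.implicits._
implicit val sqlContext: SQLContext = spark.sqlContext

val query = MemoryStream[Int].toDS.writeStream.format("console").start()
query.processAllAvailable() // block until the query is fully initialized
// ... inspect the resolved checkpoint location here, as the test does ...
query.stop()                // stopping now cannot interrupt initialization
spark.stop()
```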
--- .../spark/sql/streaming/test/DataStreamReaderWriterSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index 3de0ae67a3892..e8a6202b8adce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -641,6 +641,7 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { test("temp checkpoint dir should be deleted if a query is stopped without errors") { import testImplicits._ val query = MemoryStream[Int].toDS.writeStream.format("console").start() + query.processAllAvailable() val checkpointDir = new Path( query.asInstanceOf[StreamingQueryWrapper].streamingQuery.resolvedCheckpointRoot) val fs = checkpointDir.getFileSystem(spark.sessionState.newHadoopConf()) From 5800144a54f5c0180ccf67392f32c3e8a51119b1 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 6 Jul 2017 15:32:49 +0800 Subject: [PATCH 104/779] [SPARK-21012][SUBMIT] Add glob support for resources adding to Spark Currently "--jars (spark.jars)", "--files (spark.files)", "--py-files (spark.submit.pyFiles)" and "--archives (spark.yarn.dist.archives)" only support non-glob paths. This is fine for most cases, but when a user needs to add many jars or files to Spark, listing them one by one is too verbose. This PR proposes adding glob path support for these resources. It also improves the code that downloads resources. ## How was this patch tested? Unit tests added; also verified manually in a local cluster. Author: jerryshao Closes #18235 from jerryshao/SPARK-21012. 
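For context on the mechanism (a simplified sketch, not the patch's own code): the expansion relies on Hadoop's `FileSystem.globStatus`, which turns a glob pattern into the matching file statuses and returns `null` when nothing matches; this is why the new `resolveGlobPaths` helper in the diff below wraps the call in `Option(...)` and falls back to the raw path.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical local directory: the glob expands to every matching jar file,
// or falls back to the literal pattern when nothing matches.
val pattern = new Path("/tmp/deps/*.jar")
val fs = FileSystem.get(pattern.toUri, new Configuration())
val expanded: Seq[String] = Option(fs.globStatus(pattern))
  .map(_.filter(_.isFile).map(_.getPath.toUri.toString).toSeq)
  .getOrElse(Seq(pattern.toString))
println(expanded.mkString(","))
```

With this in place, an invocation such as `spark-submit --jars "/tmp/deps/*.jar" ...` (a hypothetical path) expands to the individual matching jars before they are distributed.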
- if (deployMode == CLIENT) { - val hadoopConf = new HadoopConfiguration() - args.primaryResource = Option(args.primaryResource).map(downloadFile(_, hadoopConf)).orNull - args.jars = Option(args.jars).map(downloadFileList(_, hadoopConf)).orNull - args.pyFiles = Option(args.pyFiles).map(downloadFileList(_, hadoopConf)).orNull - args.files = Option(args.files).map(downloadFileList(_, hadoopConf)).orNull - } - - // Require all python files to be local, so we can add them to the PYTHONPATH - // In YARN cluster mode, python files are distributed as regular files, which can be non-local. - // In Mesos cluster mode, non-local python files are automatically downloaded by Mesos. - if (args.isPython && !isYarnCluster && !isMesosCluster) { - if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { - printErrorAndExit(s"Only local python files are supported: ${args.primaryResource}") + val hadoopConf = new HadoopConfiguration() + val targetDir = Files.createTempDirectory("tmp").toFile + // scalastyle:off runtimeaddshutdownhook + Runtime.getRuntime.addShutdownHook(new Thread() { + override def run(): Unit = { + FileUtils.deleteQuietly(targetDir) } - val nonLocalPyFiles = Utils.nonLocalPaths(args.pyFiles).mkString(",") - if (nonLocalPyFiles.nonEmpty) { - printErrorAndExit(s"Only local additional python files are supported: $nonLocalPyFiles") - } - } + }) + // scalastyle:on runtimeaddshutdownhook - // Require all R files to be local - if (args.isR && !isYarnCluster && !isMesosCluster) { - if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { - printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}") - } + // Resolve glob path for different resources. + args.jars = Option(args.jars).map(resolveGlobPaths(_, hadoopConf)).orNull + args.files = Option(args.files).map(resolveGlobPaths(_, hadoopConf)).orNull + args.pyFiles = Option(args.pyFiles).map(resolveGlobPaths(_, hadoopConf)).orNull + args.archives = Option(args.archives).map(resolveGlobPaths(_, hadoopConf)).orNull + + // In client mode, download remote files. + if (deployMode == CLIENT) { + args.primaryResource = Option(args.primaryResource).map { + downloadFile(_, targetDir, args.sparkProperties, hadoopConf) + }.orNull + args.jars = Option(args.jars).map { + downloadFileList(_, targetDir, args.sparkProperties, hadoopConf) + }.orNull + args.pyFiles = Option(args.pyFiles).map { + downloadFileList(_, targetDir, args.sparkProperties, hadoopConf) + }.orNull } // The following modes are not supported or applicable @@ -841,36 +845,132 @@ object SparkSubmit extends CommandLineUtils { * Download a list of remote files to temp local files. If the file is local, the original file * will be returned. * @param fileList A comma separated file list. + * @param targetDir A temporary directory for which downloaded files + * @param sparkProperties Spark properties * @return A comma separated local files list. */ private[deploy] def downloadFileList( fileList: String, + targetDir: File, + sparkProperties: Map[String, String], hadoopConf: HadoopConfiguration): String = { require(fileList != null, "fileList cannot be null.") - fileList.split(",").map(downloadFile(_, hadoopConf)).mkString(",") + fileList.split(",") + .map(downloadFile(_, targetDir, sparkProperties, hadoopConf)) + .mkString(",") } /** * Download a file from the remote to a local temporary directory. If the input path points to * a local path, returns it with no operation. + * @param path A file path from where the files will be downloaded. 
+ * @param targetDir A temporary directory for which downloaded files + * @param sparkProperties Spark properties + * @return A comma separated local files list. */ - private[deploy] def downloadFile(path: String, hadoopConf: HadoopConfiguration): String = { + private[deploy] def downloadFile( + path: String, + targetDir: File, + sparkProperties: Map[String, String], + hadoopConf: HadoopConfiguration): String = { require(path != null, "path cannot be null.") val uri = Utils.resolveURI(path) uri.getScheme match { - case "file" | "local" => - path + case "file" | "local" => path + case "http" | "https" | "ftp" => + val uc = uri.toURL.openConnection() + uc match { + case https: HttpsURLConnection => + val trustStore = sparkProperties.get("spark.ssl.fs.trustStore") + .orElse(sparkProperties.get("spark.ssl.trustStore")) + val trustStorePwd = sparkProperties.get("spark.ssl.fs.trustStorePassword") + .orElse(sparkProperties.get("spark.ssl.trustStorePassword")) + .map(_.toCharArray) + .orNull + val protocol = sparkProperties.get("spark.ssl.fs.protocol") + .orElse(sparkProperties.get("spark.ssl.protocol")) + if (protocol.isEmpty) { + printErrorAndExit("spark ssl protocol is required when enabling SSL connection.") + } + + val trustStoreManagers = trustStore.map { t => + var input: InputStream = null + try { + input = new FileInputStream(new File(t)) + val ks = KeyStore.getInstance(KeyStore.getDefaultType) + ks.load(input, trustStorePwd) + val tmf = TrustManagerFactory.getInstance(TrustManagerFactory.getDefaultAlgorithm) + tmf.init(ks) + tmf.getTrustManagers + } finally { + if (input != null) { + input.close() + input = null + } + } + }.getOrElse { + Array({ + new X509TrustManager { + override def getAcceptedIssuers: Array[X509Certificate] = null + override def checkClientTrusted( + x509Certificates: Array[X509Certificate], s: String) {} + override def checkServerTrusted( + x509Certificates: Array[X509Certificate], s: String) {} + }: TrustManager + }) + } + val sslContext = SSLContext.getInstance(protocol.get) + sslContext.init(null, trustStoreManagers, null) + https.setSSLSocketFactory(sslContext.getSocketFactory) + https.setHostnameVerifier(new HostnameVerifier { + override def verify(s: String, sslSession: SSLSession): Boolean = false + }) + + case _ => + } + uc.setConnectTimeout(60 * 1000) + uc.setReadTimeout(60 * 1000) + uc.connect() + val in = uc.getInputStream + val fileName = new Path(uri).getName + val tempFile = new File(targetDir, fileName) + val out = new FileOutputStream(tempFile) + // scalastyle:off println + printStream.println(s"Downloading ${uri.toString} to ${tempFile.getAbsolutePath}.") + // scalastyle:on println + try { + ByteStreams.copy(in, out) + } finally { + in.close() + out.close() + } + tempFile.toURI.toString case _ => val fs = FileSystem.get(uri, hadoopConf) - val tmpFile = new File(Files.createTempDirectory("tmp").toFile, uri.getPath) + val tmpFile = new File(targetDir, new Path(uri).getName) // scalastyle:off println printStream.println(s"Downloading ${uri.toString} to ${tmpFile.getAbsolutePath}.") // scalastyle:on println fs.copyToLocalFile(new Path(uri), new Path(tmpFile.getAbsolutePath)) - Utils.resolveURI(tmpFile.getAbsolutePath).toString + tmpFile.toURI.toString } } + + private def resolveGlobPaths(paths: String, hadoopConf: HadoopConfiguration): String = { + require(paths != null, "paths cannot be null.") + paths.split(",").map(_.trim).filter(_.nonEmpty).flatMap { path => + val uri = Utils.resolveURI(path) + uri.getScheme match { + case "local" | "http" | "https" 
| "ftp" => Array(path) + case _ => + val fs = FileSystem.get(uri, hadoopConf) + Option(fs.globStatus(new Path(uri))).map { status => + status.filter(_.isFile).map(_.getPath.toUri.toString) + }.getOrElse(Array(path)) + } + }.mkString(",") + } } /** Provides utility functions to be used inside SparkSubmit. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 7800d3d624e3e..fd1521193fdee 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -520,7 +520,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | (Default: client). | --class CLASS_NAME Your application's main class (for Java / Scala apps). | --name NAME A name of your application. - | --jars JARS Comma-separated list of local jars to include on the driver + | --jars JARS Comma-separated list of jars to include on the driver | and executor classpaths. | --packages Comma-separated list of maven coordinates of jars to include | on the driver and executor classpaths. Will search the local diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index b089357e7b868..97357cdbb6083 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -20,12 +20,14 @@ package org.apache.spark.deploy import java.io._ import java.net.URI import java.nio.charset.StandardCharsets +import java.nio.file.Files +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.Source import com.google.common.io.ByteStreams -import org.apache.commons.io.{FilenameUtils, FileUtils} +import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{BeforeAndAfterEach, Matchers} @@ -42,7 +44,6 @@ import org.apache.spark.TestUtils.JavaSourceFromString import org.apache.spark.scheduler.EventLoggingListener import org.apache.spark.util.{CommandLineUtils, ResetSystemProperties, Utils} - trait TestPrematureExit { suite: SparkFunSuite => @@ -726,6 +727,47 @@ class SparkSubmitSuite Utils.unionFileLists(None, Option("/tmp/a.jar")) should be (Set("/tmp/a.jar")) Utils.unionFileLists(Option("/tmp/a.jar"), None) should be (Set("/tmp/a.jar")) } + + test("support glob path") { + val tmpJarDir = Utils.createTempDir() + val jar1 = TestUtils.createJarWithFiles(Map("test.resource" -> "1"), tmpJarDir) + val jar2 = TestUtils.createJarWithFiles(Map("test.resource" -> "USER"), tmpJarDir) + + val tmpFileDir = Utils.createTempDir() + val file1 = File.createTempFile("tmpFile1", "", tmpFileDir) + val file2 = File.createTempFile("tmpFile2", "", tmpFileDir) + + val tmpPyFileDir = Utils.createTempDir() + val pyFile1 = File.createTempFile("tmpPy1", ".py", tmpPyFileDir) + val pyFile2 = File.createTempFile("tmpPy2", ".egg", tmpPyFileDir) + + val tmpArchiveDir = Utils.createTempDir() + val archive1 = File.createTempFile("archive1", ".zip", tmpArchiveDir) + val archive2 = File.createTempFile("archive2", ".zip", tmpArchiveDir) + + val args = Seq( + "--class", UserClasspathFirstTest.getClass.getName.stripPrefix("$"), + "--name", "testApp", + "--master", "yarn", + "--deploy-mode", "client", + "--jars", s"${tmpJarDir.getAbsolutePath}/*.jar", + "--files", 
s"${tmpFileDir.getAbsolutePath}/tmpFile*", + "--py-files", s"${tmpPyFileDir.getAbsolutePath}/tmpPy*", + "--archives", s"${tmpArchiveDir.getAbsolutePath}/*.zip", + jar2.toString) + + val appArgs = new SparkSubmitArguments(args) + val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 + sysProps("spark.yarn.dist.jars").split(",").toSet should be + (Set(jar1.toURI.toString, jar2.toURI.toString)) + sysProps("spark.yarn.dist.files").split(",").toSet should be + (Set(file1.toURI.toString, file2.toURI.toString)) + sysProps("spark.submit.pyFiles").split(",").toSet should be + (Set(pyFile1.getAbsolutePath, pyFile2.getAbsolutePath)) + sysProps("spark.yarn.dist.archives").split(",").toSet should be + (Set(archive1.toURI.toString, archive2.toURI.toString)) + } + // scalastyle:on println private def checkDownloadedFile(sourcePath: String, outputPath: String): Unit = { @@ -738,7 +780,7 @@ class SparkSubmitSuite assert(outputUri.getScheme === "file") // The path and filename are preserved. - assert(outputUri.getPath.endsWith(sourceUri.getPath)) + assert(outputUri.getPath.endsWith(new Path(sourceUri).getName)) assert(FileUtils.readFileToString(new File(outputUri.getPath)) === FileUtils.readFileToString(new File(sourceUri.getPath))) } @@ -752,25 +794,29 @@ class SparkSubmitSuite test("downloadFile - invalid url") { intercept[IOException] { - SparkSubmit.downloadFile("abc:/my/file", new Configuration()) + SparkSubmit.downloadFile( + "abc:/my/file", Utils.createTempDir(), mutable.Map.empty, new Configuration()) } } test("downloadFile - file doesn't exist") { val hadoopConf = new Configuration() + val tmpDir = Utils.createTempDir() // Set s3a implementation to local file system for testing. hadoopConf.set("fs.s3a.impl", "org.apache.spark.deploy.TestFileSystem") // Disable file system impl cache to make sure the test file system is picked up. hadoopConf.set("fs.s3a.impl.disable.cache", "true") intercept[FileNotFoundException] { - SparkSubmit.downloadFile("s3a:/no/such/file", hadoopConf) + SparkSubmit.downloadFile("s3a:/no/such/file", tmpDir, mutable.Map.empty, hadoopConf) } } test("downloadFile does not download local file") { // empty path is considered as local file. - assert(SparkSubmit.downloadFile("", new Configuration()) === "") - assert(SparkSubmit.downloadFile("/local/file", new Configuration()) === "/local/file") + val tmpDir = Files.createTempDirectory("tmp").toFile + assert(SparkSubmit.downloadFile("", tmpDir, mutable.Map.empty, new Configuration()) === "") + assert(SparkSubmit.downloadFile("/local/file", tmpDir, mutable.Map.empty, + new Configuration()) === "/local/file") } test("download one file to local") { @@ -779,12 +825,14 @@ class SparkSubmitSuite val content = "hello, world" FileUtils.write(jarFile, content) val hadoopConf = new Configuration() + val tmpDir = Files.createTempDirectory("tmp").toFile // Set s3a implementation to local file system for testing. hadoopConf.set("fs.s3a.impl", "org.apache.spark.deploy.TestFileSystem") // Disable file system impl cache to make sure the test file system is picked up. 
hadoopConf.set("fs.s3a.impl.disable.cache", "true") val sourcePath = s"s3a://${jarFile.getAbsolutePath}" - val outputPath = SparkSubmit.downloadFile(sourcePath, hadoopConf) + val outputPath = + SparkSubmit.downloadFile(sourcePath, tmpDir, mutable.Map.empty, hadoopConf) checkDownloadedFile(sourcePath, outputPath) deleteTempOutputFile(outputPath) } @@ -795,12 +843,14 @@ class SparkSubmitSuite val content = "hello, world" FileUtils.write(jarFile, content) val hadoopConf = new Configuration() + val tmpDir = Files.createTempDirectory("tmp").toFile // Set s3a implementation to local file system for testing. hadoopConf.set("fs.s3a.impl", "org.apache.spark.deploy.TestFileSystem") // Disable file system impl cache to make sure the test file system is picked up. hadoopConf.set("fs.s3a.impl.disable.cache", "true") val sourcePaths = Seq("/local/file", s"s3a://${jarFile.getAbsolutePath}") - val outputPaths = SparkSubmit.downloadFileList(sourcePaths.mkString(","), hadoopConf).split(",") + val outputPaths = SparkSubmit.downloadFileList( + sourcePaths.mkString(","), tmpDir, mutable.Map.empty, hadoopConf).split(",") assert(outputPaths.length === sourcePaths.length) sourcePaths.zip(outputPaths).foreach { case (sourcePath, outputPath) => diff --git a/docs/configuration.md b/docs/configuration.md index c785a664c67b1..7dc23e441a7ba 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -422,21 +422,21 @@ Apart from these, the following properties are also available, and may be useful spark.files - Comma-separated list of files to be placed in the working directory of each executor. + Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed. spark.submit.pyFiles - Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. + Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed. spark.jars - Comma-separated list of local jars to include on the driver and executor classpaths. + Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed. From 6ff05a66fe83e721063efe5c28d2ffeb850fecc7 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 6 Jul 2017 15:47:09 +0800 Subject: [PATCH 105/779] [SPARK-20703][SQL] Associate metrics with data writes onto DataFrameWriter operations ## What changes were proposed in this pull request? Right now in the UI, after SPARK-20213, we can show the operations to write data out. However, there is no way to associate metrics with data writes. We should show relative metrics on the operations. #### Supported commands This change supports updating metrics for file-based data writing operations, including `InsertIntoHadoopFsRelationCommand`, `InsertIntoHiveTable`. Supported metrics: * number of written files * number of dynamic partitions * total bytes of written data * total number of output rows * average writing data out time (ms) * (TODO) min/med/max number of output rows per file/partition * (TODO) min/med/max bytes of written data per file/partition #### Commands not supported `InsertIntoDataSourceCommand`, `SaveIntoDataSourceCommand`: The two commands uses DataSource APIs to write data out, i.e., the logic of writing data out is delegated to the DataSource implementations, such as `InsertableRelation.insert` and `CreatableRelationProvider.createRelation`. So we can't obtain metrics from delegated methods for now. 
`CreateHiveTableAsSelectCommand`, `CreateDataSourceTableAsSelectCommand`: The two commands invoke other commands to write data out. The invoked commands can even write to non-file-based data sources. We leave them as a future TODO. #### How to update metrics of writing files out A `RunnableCommand` that wants to update metrics needs to override its `metrics` and provide the metrics data structure to `ExecutedCommandExec`. The metrics are prepared during the execution of `FileFormatWriter`. The callback function passed to `FileFormatWriter` accepts the metrics and updates them accordingly. There is a metrics-updating function in `RunnableCommand`. At runtime, the function is bound to the Spark context and the `metrics` of `ExecutedCommandExec`, and is passed to `FileFormatWriter`. ## How was this patch tested? Updated unit tests. Author: Liang-Chi Hsieh Closes #18159 from viirya/SPARK-20703-2. --- .../scala/org/apache/spark/util/Utils.scala | 9 ++ .../command/DataWritingCommand.scala | 75 ++++++++++ .../sql/execution/command/commands.scala | 12 ++ .../datasources/FileFormatWriter.scala | 121 ++++++++++++--- .../InsertIntoHadoopFsRelationCommand.scala | 18 ++- .../sql/sources/PartitionedWriteSuite.scala | 21 +-- .../hive/execution/InsertIntoHiveTable.scala | 8 +- .../sql/hive/execution/SQLMetricsSuite.scala | 139 ++++++++++++++++++ 8 files changed, 362 insertions(+), 41 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 26f61e25da4d3..b4caf68f0afaa 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1002,6 +1002,15 @@ private[spark] object Utils extends Logging { } } + /** + * Lists files recursively. + */ + def recursiveList(f: File): Array[File] = { + require(f.isDirectory) + val current = f.listFiles + current ++ current.filter(_.isDirectory).flatMap(recursiveList) + } + /** * Delete a file or directory and its contents recursively. * Don't follow directories if they are symlinks. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala new file mode 100644 index 0000000000000..0c381a2c02986 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkContext +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.datasources.ExecutedWriteSummary +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} + +/** + * A special `RunnableCommand` which writes data out and updates metrics. + */ +trait DataWritingCommand extends RunnableCommand { + + override lazy val metrics: Map[String, SQLMetric] = { + val sparkContext = SparkContext.getActive.get + Map( + "avgTime" -> SQLMetrics.createMetric(sparkContext, "average writing time (ms)"), + "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of written files"), + "numOutputBytes" -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numParts" -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") + ) + } + + /** + * Callback function that update metrics collected from the writing operation. + */ + protected def updateWritingMetrics(writeSummaries: Seq[ExecutedWriteSummary]): Unit = { + val sparkContext = SparkContext.getActive.get + var numPartitions = 0 + var numFiles = 0 + var totalNumBytes: Long = 0L + var totalNumOutput: Long = 0L + var totalWritingTime: Long = 0L + + writeSummaries.foreach { summary => + numPartitions += summary.updatedPartitions.size + numFiles += summary.numOutputFile + totalNumBytes += summary.numOutputBytes + totalNumOutput += summary.numOutputRows + totalWritingTime += summary.totalWritingTime + } + + val avgWritingTime = if (numFiles > 0) { + (totalWritingTime / numFiles).toLong + } else { + 0L + } + + metrics("avgTime").add(avgWritingTime) + metrics("numFiles").add(numFiles) + metrics("numOutputBytes").add(totalNumBytes) + metrics("numOutputRows").add(totalNumOutput) + metrics("numParts").add(numPartitions) + + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 81bc93e7ebcf4..7cd4baef89e75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.{logical, QueryPlan} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ +import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ @@ -37,6 +38,11 @@ import org.apache.spark.sql.types._ * wrapped in `ExecutedCommand` during execution. */ trait RunnableCommand extends logical.Command { + + // The map used to record the metrics of running the command. This will be passed to + // `ExecutedCommand` during query planning. 
+ lazy val metrics: Map[String, SQLMetric] = Map.empty + def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { throw new NotImplementedError } @@ -49,8 +55,14 @@ trait RunnableCommand extends logical.Command { /** * A physical operator that executes the run method of a `RunnableCommand` and * saves the result to prevent multiple executions. + * + * @param cmd the `RunnableCommand` this operator will run. + * @param children the children physical plans ran by the `RunnableCommand`. */ case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) extends SparkPlan { + + override lazy val metrics: Map[String, SQLMetric] = cmd.metrics + /** * A concrete command should override this lazy field to wrap up any side effects caused by the * command or any other computation that should be evaluated exactly once. The value of this field diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 0daffa93b4747..64866630623ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -22,7 +22,7 @@ import java.util.{Date, UUID} import scala.collection.mutable import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl @@ -82,7 +82,7 @@ object FileFormatWriter extends Logging { } /** The result of a successful write task. */ - private case class WriteTaskResult(commitMsg: TaskCommitMessage, updatedPartitions: Set[String]) + private case class WriteTaskResult(commitMsg: TaskCommitMessage, summary: ExecutedWriteSummary) /** * Basic work flow of this command is: @@ -104,7 +104,7 @@ object FileFormatWriter extends Logging { hadoopConf: Configuration, partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], - refreshFunction: (Seq[TablePartitionSpec]) => Unit, + refreshFunction: (Seq[ExecutedWriteSummary]) => Unit, options: Map[String, String]): Unit = { val job = Job.getInstance(hadoopConf) @@ -196,12 +196,10 @@ object FileFormatWriter extends Logging { }) val commitMsgs = ret.map(_.commitMsg) - val updatedPartitions = ret.flatMap(_.updatedPartitions) - .distinct.map(PartitioningUtils.parsePathFragment) committer.commitJob(job, commitMsgs) logInfo(s"Job ${job.getJobID} committed.") - refreshFunction(updatedPartitions) + refreshFunction(ret.map(_.summary)) } catch { case cause: Throwable => logError(s"Aborting job ${job.getJobID}.", cause) committer.abortJob(job) @@ -247,9 +245,9 @@ object FileFormatWriter extends Logging { try { Utils.tryWithSafeFinallyAndFailureCallbacks(block = { // Execute the task to write rows out and commit the task. - val outputPartitions = writeTask.execute(iterator) + val summary = writeTask.execute(iterator) writeTask.releaseResources() - WriteTaskResult(committer.commitTask(taskAttemptContext), outputPartitions) + WriteTaskResult(committer.commitTask(taskAttemptContext), summary) })(catchBlock = { // If there is an error, release resource and then abort the task try { @@ -273,12 +271,36 @@ object FileFormatWriter extends Logging { * automatically trigger task aborts. 
*/ private trait ExecuteWriteTask { + /** - * Writes data out to files, and then returns the list of partition strings written out. - * The list of partitions is sent back to the driver and used to update the catalog. + * The data structures used to measure metrics during writing. */ - def execute(iterator: Iterator[InternalRow]): Set[String] + protected var totalWritingTime: Long = 0L + protected var timeOnCurrentFile: Long = 0L + protected var numOutputRows: Long = 0L + protected var numOutputBytes: Long = 0L + + /** + * Writes data out to files, and then returns the summary of relative information which + * includes the list of partition strings written out. The list of partitions is sent back + * to the driver and used to update the catalog. Other information will be sent back to the + * driver too and used to update the metrics in UI. + */ + def execute(iterator: Iterator[InternalRow]): ExecutedWriteSummary def releaseResources(): Unit + + /** + * A helper function used to determine the size in bytes of a written file. + */ + protected def getFileSize(conf: Configuration, filePath: String): Long = { + if (filePath != null) { + val path = new Path(filePath) + val fs = path.getFileSystem(conf) + fs.getFileStatus(path).getLen() + } else { + 0L + } + } } /** Writes data to a single directory (used for non-dynamic-partition writes). */ @@ -288,24 +310,26 @@ object FileFormatWriter extends Logging { committer: FileCommitProtocol) extends ExecuteWriteTask { private[this] var currentWriter: OutputWriter = _ + private[this] var currentPath: String = _ private def newOutputWriter(fileCounter: Int): Unit = { val ext = description.outputWriterFactory.getFileExtension(taskAttemptContext) - val tmpFilePath = committer.newTaskTempFile( + currentPath = committer.newTaskTempFile( taskAttemptContext, None, f"-c$fileCounter%03d" + ext) currentWriter = description.outputWriterFactory.newInstance( - path = tmpFilePath, + path = currentPath, dataSchema = description.dataColumns.toStructType, context = taskAttemptContext) } - override def execute(iter: Iterator[InternalRow]): Set[String] = { + override def execute(iter: Iterator[InternalRow]): ExecutedWriteSummary = { var fileCounter = 0 var recordsInFile: Long = 0L newOutputWriter(fileCounter) + while (iter.hasNext) { if (description.maxRecordsPerFile > 0 && recordsInFile >= description.maxRecordsPerFile) { fileCounter += 1 @@ -314,21 +338,35 @@ object FileFormatWriter extends Logging { recordsInFile = 0 releaseResources() + numOutputRows += recordsInFile newOutputWriter(fileCounter) } val internalRow = iter.next() + val startTime = System.nanoTime() currentWriter.write(internalRow) + timeOnCurrentFile += (System.nanoTime() - startTime) recordsInFile += 1 } releaseResources() - Set.empty + numOutputRows += recordsInFile + + ExecutedWriteSummary( + updatedPartitions = Set.empty, + numOutputFile = fileCounter + 1, + numOutputBytes = numOutputBytes, + numOutputRows = numOutputRows, + totalWritingTime = totalWritingTime) } override def releaseResources(): Unit = { if (currentWriter != null) { try { + val startTime = System.nanoTime() currentWriter.close() + totalWritingTime += (timeOnCurrentFile + System.nanoTime() - startTime) / 1000 / 1000 + timeOnCurrentFile = 0 + numOutputBytes += getFileSize(taskAttemptContext.getConfiguration, currentPath) } finally { currentWriter = null } @@ -348,6 +386,8 @@ object FileFormatWriter extends Logging { // currentWriter is initialized whenever we see a new key private var currentWriter: OutputWriter = _ + private var 
currentPath: String = _ + /** Expressions that given partition columns build a path string like: col1=val/col2=val/... */ private def partitionPathExpression: Seq[Expression] = { desc.partitionColumns.zipWithIndex.flatMap { case (c, i) => @@ -403,19 +443,19 @@ object FileFormatWriter extends Logging { case _ => None } - val path = if (customPath.isDefined) { + currentPath = if (customPath.isDefined) { committer.newTaskTempFileAbsPath(taskAttemptContext, customPath.get, ext) } else { committer.newTaskTempFile(taskAttemptContext, partDir, ext) } currentWriter = desc.outputWriterFactory.newInstance( - path = path, + path = currentPath, dataSchema = desc.dataColumns.toStructType, context = taskAttemptContext) } - override def execute(iter: Iterator[InternalRow]): Set[String] = { + override def execute(iter: Iterator[InternalRow]): ExecutedWriteSummary = { val getPartitionColsAndBucketId = UnsafeProjection.create( desc.partitionColumns ++ desc.bucketIdExpression, desc.allColumns) @@ -429,15 +469,22 @@ object FileFormatWriter extends Logging { // If anything below fails, we should abort the task. var recordsInFile: Long = 0L var fileCounter = 0 + var totalFileCounter = 0 var currentPartColsAndBucketId: UnsafeRow = null val updatedPartitions = mutable.Set[String]() + for (row <- iter) { val nextPartColsAndBucketId = getPartitionColsAndBucketId(row) if (currentPartColsAndBucketId != nextPartColsAndBucketId) { + if (currentPartColsAndBucketId != null) { + totalFileCounter += (fileCounter + 1) + } + // See a new partition or bucket - write to a new partition dir (or a new bucket file). currentPartColsAndBucketId = nextPartColsAndBucketId.copy() logDebug(s"Writing partition: $currentPartColsAndBucketId") + numOutputRows += recordsInFile recordsInFile = 0 fileCounter = 0 @@ -447,6 +494,8 @@ object FileFormatWriter extends Logging { recordsInFile >= desc.maxRecordsPerFile) { // Exceeded the threshold in terms of the number of records per file. // Create a new file by increasing the file counter. + + numOutputRows += recordsInFile recordsInFile = 0 fileCounter += 1 assert(fileCounter < MAX_FILE_COUNTER, @@ -455,18 +504,33 @@ object FileFormatWriter extends Logging { releaseResources() newOutputWriter(currentPartColsAndBucketId, getPartPath, fileCounter, updatedPartitions) } - + val startTime = System.nanoTime() currentWriter.write(getOutputRow(row)) + timeOnCurrentFile += (System.nanoTime() - startTime) recordsInFile += 1 } + if (currentPartColsAndBucketId != null) { + totalFileCounter += (fileCounter + 1) + } releaseResources() - updatedPartitions.toSet + numOutputRows += recordsInFile + + ExecutedWriteSummary( + updatedPartitions = updatedPartitions.toSet, + numOutputFile = totalFileCounter, + numOutputBytes = numOutputBytes, + numOutputRows = numOutputRows, + totalWritingTime = totalWritingTime) } override def releaseResources(): Unit = { if (currentWriter != null) { try { + val startTime = System.nanoTime() currentWriter.close() + totalWritingTime += (timeOnCurrentFile + System.nanoTime() - startTime) / 1000 / 1000 + timeOnCurrentFile = 0 + numOutputBytes += getFileSize(taskAttemptContext.getConfiguration, currentPath) } finally { currentWriter = null } @@ -474,3 +538,20 @@ object FileFormatWriter extends Logging { } } } + +/** + * Wrapper class for the metrics of writing data out. + * + * @param updatedPartitions the partitions updated during writing data out. Only valid + * for dynamic partition. + * @param numOutputFile the total number of files. 
+ * @param numOutputRows the number of output rows. + * @param numOutputBytes the bytes of output data. + * @param totalWritingTime the total writing time in ms. + */ +case class ExecutedWriteSummary( + updatedPartitions: Set[String], + numOutputFile: Int, + numOutputRows: Long, + numOutputBytes: Long, + totalWritingTime: Long) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index ab26f2affbce5..0031567d3d288 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -21,6 +21,7 @@ import java.io.IOException import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkContext import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} @@ -29,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. @@ -53,7 +55,7 @@ case class InsertIntoHadoopFsRelationCommand( mode: SaveMode, catalogTable: Option[CatalogTable], fileIndex: Option[FileIndex]) - extends RunnableCommand { + extends DataWritingCommand { import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName override def children: Seq[LogicalPlan] = query :: Nil @@ -123,8 +125,16 @@ case class InsertIntoHadoopFsRelationCommand( if (doInsertion) { - // Callback for updating metastore partition metadata after the insertion job completes. - def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = { + // Callback for updating metric and metastore partition metadata + // after the insertion job completes. + def refreshCallback(summary: Seq[ExecutedWriteSummary]): Unit = { + val updatedPartitions = summary.flatMap(_.updatedPartitions) + .distinct.map(PartitioningUtils.parsePathFragment) + + // Updating metrics. + updateWritingMetrics(summary) + + // Updating metastore partition metadata. 
if (partitionsTrackedByCatalog) { val newPartitions = updatedPartitions.toSet -- initialMatchingPartitions if (newPartitions.nonEmpty) { @@ -154,7 +164,7 @@ case class InsertIntoHadoopFsRelationCommand( hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, - refreshFunction = refreshPartitionsCallback, + refreshFunction = refreshCallback, options = options) // refresh cached files in FileIndex diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index a2f3afe3ce236..6f998aa60faf5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -91,15 +91,15 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { withTempDir { f => spark.range(start = 0, end = 4, step = 1, numPartitions = 1) .write.option("maxRecordsPerFile", 1).mode("overwrite").parquet(f.getAbsolutePath) - assert(recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 4) + assert(Utils.recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 4) spark.range(start = 0, end = 4, step = 1, numPartitions = 1) .write.option("maxRecordsPerFile", 2).mode("overwrite").parquet(f.getAbsolutePath) - assert(recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 2) + assert(Utils.recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 2) spark.range(start = 0, end = 4, step = 1, numPartitions = 1) .write.option("maxRecordsPerFile", -1).mode("overwrite").parquet(f.getAbsolutePath) - assert(recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 1) + assert(Utils.recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 1) } } @@ -111,7 +111,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { .option("maxRecordsPerFile", 1) .mode("overwrite") .parquet(f.getAbsolutePath) - assert(recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 4) + assert(Utils.recursiveList(f).count(_.getAbsolutePath.endsWith("parquet")) == 4) } } @@ -138,14 +138,14 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { val df = Seq((1, ts)).toDF("i", "ts") withTempPath { f => df.write.partitionBy("ts").parquet(f.getAbsolutePath) - val files = recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) + val files = Utils.recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) assert(files.length == 1) checkPartitionValues(files.head, "2016-12-01 00:00:00") } withTempPath { f => df.write.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") .partitionBy("ts").parquet(f.getAbsolutePath) - val files = recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) + val files = Utils.recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) assert(files.length == 1) // use timeZone option "GMT" to format partition value. checkPartitionValues(files.head, "2016-12-01 08:00:00") @@ -153,18 +153,11 @@ class PartitionedWriteSuite extends QueryTest with SharedSQLContext { withTempPath { f => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "GMT") { df.write.partitionBy("ts").parquet(f.getAbsolutePath) - val files = recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) + val files = Utils.recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) assert(files.length == 1) // if there isn't timeZone option, then use session local timezone. 
checkPartitionValues(files.head, "2016-12-01 08:00:00") } } } - - /** Lists files recursively. */ - private def recursiveList(f: File): Array[File] = { - require(f.isDirectory) - val current = f.listFiles - current ++ current.filter(_.isDirectory).flatMap(recursiveList) - } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 223d375232393..cd263e8b6df8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -31,14 +31,16 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner import org.apache.hadoop.hive.ql.ErrorMsg import org.apache.hadoop.hive.ql.plan.TableDesc +import org.apache.spark.SparkContext import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.command.{CommandUtils, RunnableCommand} +import org.apache.spark.sql.execution.command.{CommandUtils, DataWritingCommand} import org.apache.spark.sql.execution.datasources.FileFormatWriter +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.{HiveClientImpl, HiveVersion} @@ -80,7 +82,7 @@ case class InsertIntoHiveTable( partition: Map[String, Option[String]], query: LogicalPlan, overwrite: Boolean, - ifPartitionNotExists: Boolean) extends RunnableCommand { + ifPartitionNotExists: Boolean) extends DataWritingCommand { override def children: Seq[LogicalPlan] = query :: Nil @@ -354,7 +356,7 @@ case class InsertIntoHiveTable( hadoopConf = hadoopConf, partitionColumns = partitionAttributes, bucketSpec = None, - refreshFunction = _ => (), + refreshFunction = updateWritingMetrics, options = Map.empty) if (partition.nonEmpty) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala new file mode 100644 index 0000000000000..1ef1988d4c605 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution + +import java.io.File + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.Utils + +class SQLMetricsSuite extends SQLTestUtils with TestHiveSingleton { + import spark.implicits._ + + /** + * Get execution metrics for the SQL execution and verify metrics values. + * + * @param metricsValues the expected metric values (numFiles, numPartitions, numOutputRows). + * @param func the function can produce execution id after running. + */ + private def verifyWriteDataMetrics(metricsValues: Seq[Int])(func: => Unit): Unit = { + val previousExecutionIds = spark.sharedState.listener.executionIdToData.keySet + // Run the given function to trigger query execution. + func + spark.sparkContext.listenerBus.waitUntilEmpty(10000) + val executionIds = + spark.sharedState.listener.executionIdToData.keySet.diff(previousExecutionIds) + assert(executionIds.size == 1) + val executionId = executionIds.head + + val executionData = spark.sharedState.listener.getExecution(executionId).get + val executedNode = executionData.physicalPlanGraph.nodes.head + + val metricsNames = Seq( + "number of written files", + "number of dynamic part", + "number of output rows") + + val metrics = spark.sharedState.listener.getExecutionMetrics(executionId) + + metricsNames.zip(metricsValues).foreach { case (metricsName, expected) => + val sqlMetric = executedNode.metrics.find(_.name == metricsName) + assert(sqlMetric.isDefined) + val accumulatorId = sqlMetric.get.accumulatorId + val metricValue = metrics(accumulatorId).replaceAll(",", "").toInt + assert(metricValue == expected) + } + + val totalNumBytesMetric = executedNode.metrics.find(_.name == "bytes of written output").get + val totalNumBytes = metrics(totalNumBytesMetric.accumulatorId).replaceAll(",", "").toInt + assert(totalNumBytes > 0) + val writingTimeMetric = executedNode.metrics.find(_.name == "average writing time (ms)").get + val writingTime = metrics(writingTimeMetric.accumulatorId).replaceAll(",", "").toInt + assert(writingTime >= 0) + } + + private def testMetricsNonDynamicPartition( + dataFormat: String, + tableName: String): Unit = { + withTable(tableName) { + Seq((1, 2)).toDF("i", "j") + .write.format(dataFormat).mode("overwrite").saveAsTable(tableName) + + val tableLocation = + new File(spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).location) + + // 2 files, 100 rows, 0 dynamic partition. + verifyWriteDataMetrics(Seq(2, 0, 100)) { + (0 until 100).map(i => (i, i + 1)).toDF("i", "j").repartition(2) + .write.format(dataFormat).mode("overwrite").insertInto(tableName) + } + assert(Utils.recursiveList(tableLocation).count(_.getName.startsWith("part-")) == 2) + } + } + + private def testMetricsDynamicPartition( + provider: String, + dataFormat: String, + tableName: String): Unit = { + withTempPath { dir => + spark.sql( + s""" + |CREATE TABLE $tableName(a int, b int) + |USING $provider + |PARTITIONED BY(a) + |LOCATION '${dir.toURI}' + """.stripMargin) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) + assert(table.location == makeQualifiedPath(dir.getAbsolutePath)) + + val df = spark.range(start = 0, end = 40, step = 1, numPartitions = 1) + .selectExpr("id a", "id b") + + // 40 files, 80 rows, 40 dynamic partitions. 
+ verifyWriteDataMetrics(Seq(40, 40, 80)) { + df.union(df).repartition(2, $"a") + .write + .format(dataFormat) + .mode("overwrite") + .insertInto(tableName) + } + assert(Utils.recursiveList(dir).count(_.getName.startsWith("part-")) == 40) + } + } + + test("writing data out metrics: parquet") { + testMetricsNonDynamicPartition("parquet", "t1") + } + + test("writing data out metrics with dynamic partition: parquet") { + testMetricsDynamicPartition("parquet", "parquet", "t1") + } + + test("writing data out metrics: hive") { + testMetricsNonDynamicPartition("hive", "t1") + } + + test("writing data out metrics dynamic partition: hive") { + withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) { + testMetricsDynamicPartition("hive", "hive", "t1") + } + } +} From b8e4d567a7d6c2ff277700d4e7707e57e87c7808 Mon Sep 17 00:00:00 2001 From: wangzhenhua Date: Thu, 6 Jul 2017 16:00:31 +0800 Subject: [PATCH 106/779] [SPARK-21324][TEST] Improve statistics test suites ## What changes were proposed in this pull request? 1. move `StatisticsCollectionTestBase` to a separate file. 2. move some test cases to `StatisticsCollectionSuite` so that `hive/StatisticsSuite` only keeps tests that need hive support. 3. clear up some test cases. ## How was this patch tested? Existing tests. Author: wangzhenhua Author: Zhenhua Wang Closes #18545 from wzhfy/cleanStatSuites. --- .../spark/sql/StatisticsCollectionSuite.scala | 193 +++--------------- .../sql/StatisticsCollectionTestBase.scala | 192 +++++++++++++++++ .../spark/sql/hive/StatisticsSuite.scala | 124 +++-------- 3 files changed, 258 insertions(+), 251 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index d9392de37a815..843ced7f0e697 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -17,19 +17,12 @@ package org.apache.spark.sql -import java.{lang => jl} -import java.sql.{Date, Timestamp} - import scala.collection.mutable -import scala.util.Random import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} -import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData.ArrayData import org.apache.spark.sql.types._ @@ -58,6 +51,37 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } + test("analyzing views is not supported") { + def assertAnalyzeUnsupported(analyzeCommand: String): Unit = { + val err = intercept[AnalysisException] { + sql(analyzeCommand) + } + assert(err.message.contains("ANALYZE TABLE is not supported")) + } + + val tableName = "tbl" + withTable(tableName) { + spark.range(10).write.saveAsTable(tableName) + val viewName = "view" + withView(viewName) { + sql(s"CREATE VIEW $viewName AS SELECT * FROM $tableName") + assertAnalyzeUnsupported(s"ANALYZE TABLE $viewName COMPUTE STATISTICS") + 
assertAnalyzeUnsupported(s"ANALYZE TABLE $viewName COMPUTE STATISTICS FOR COLUMNS id") + } + } + } + + test("statistics collection of a table with zero column") { + val table_no_cols = "table_no_cols" + withTable(table_no_cols) { + val rddNoCols = sparkContext.parallelize(1 to 10).map(_ => Row.empty) + val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty)) + dfNoCols.write.format("json").saveAsTable(table_no_cols) + sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS") + checkTableStats(table_no_cols, hasSizeInBytes = true, expectedRowCounts = Some(10)) + } + } + test("analyze column command - unsupported types and invalid columns") { val tableName = "column_stats_test1" withTable(tableName) { @@ -239,154 +263,3 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } - - -/** - * The base for test cases that we want to include in both the hive module (for verifying behavior - * when using the Hive external catalog) as well as in the sql/core module. - */ -abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils { - import testImplicits._ - - private val dec1 = new java.math.BigDecimal("1.000000000000000000") - private val dec2 = new java.math.BigDecimal("8.000000000000000000") - private val d1 = Date.valueOf("2016-05-08") - private val d2 = Date.valueOf("2016-05-09") - private val t1 = Timestamp.valueOf("2016-05-08 00:00:01") - private val t2 = Timestamp.valueOf("2016-05-09 00:00:02") - - /** - * Define a very simple 3 row table used for testing column serialization. - * Note: last column is seq[int] which doesn't support stats collection. - */ - protected val data = Seq[ - (jl.Boolean, jl.Byte, jl.Short, jl.Integer, jl.Long, - jl.Double, jl.Float, java.math.BigDecimal, - String, Array[Byte], Date, Timestamp, - Seq[Int])]( - (false, 1.toByte, 1.toShort, 1, 1L, 1.0, 1.0f, dec1, "s1", "b1".getBytes, d1, t1, null), - (true, 2.toByte, 3.toShort, 4, 5L, 6.0, 7.0f, dec2, "ss9", "bb0".getBytes, d2, t2, null), - (null, null, null, null, null, null, null, null, null, null, null, null, null) - ) - - /** A mapping from column to the stats collected. 
*/ - protected val stats = mutable.LinkedHashMap( - "cbool" -> ColumnStat(2, Some(false), Some(true), 1, 1, 1), - "cbyte" -> ColumnStat(2, Some(1.toByte), Some(2.toByte), 1, 1, 1), - "cshort" -> ColumnStat(2, Some(1.toShort), Some(3.toShort), 1, 2, 2), - "cint" -> ColumnStat(2, Some(1), Some(4), 1, 4, 4), - "clong" -> ColumnStat(2, Some(1L), Some(5L), 1, 8, 8), - "cdouble" -> ColumnStat(2, Some(1.0), Some(6.0), 1, 8, 8), - "cfloat" -> ColumnStat(2, Some(1.0f), Some(7.0f), 1, 4, 4), - "cdecimal" -> ColumnStat(2, Some(Decimal(dec1)), Some(Decimal(dec2)), 1, 16, 16), - "cstring" -> ColumnStat(2, None, None, 1, 3, 3), - "cbinary" -> ColumnStat(2, None, None, 1, 3, 3), - "cdate" -> ColumnStat(2, Some(DateTimeUtils.fromJavaDate(d1)), - Some(DateTimeUtils.fromJavaDate(d2)), 1, 4, 4), - "ctimestamp" -> ColumnStat(2, Some(DateTimeUtils.fromJavaTimestamp(t1)), - Some(DateTimeUtils.fromJavaTimestamp(t2)), 1, 8, 8) - ) - - private val randomName = new Random(31) - - def checkTableStats( - tableName: String, - hasSizeInBytes: Boolean, - expectedRowCounts: Option[Int]): Option[CatalogStatistics] = { - val stats = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats - if (hasSizeInBytes || expectedRowCounts.nonEmpty) { - assert(stats.isDefined) - assert(stats.get.sizeInBytes >= 0) - assert(stats.get.rowCount === expectedRowCounts) - } else { - assert(stats.isEmpty) - } - - stats - } - - /** - * Compute column stats for the given DataFrame and compare it with colStats. - */ - def checkColStats( - df: DataFrame, - colStats: mutable.LinkedHashMap[String, ColumnStat]): Unit = { - val tableName = "column_stats_test_" + randomName.nextInt(1000) - withTable(tableName) { - df.write.saveAsTable(tableName) - - // Collect statistics - sql(s"analyze table $tableName compute STATISTICS FOR COLUMNS " + - colStats.keys.mkString(", ")) - - // Validate statistics - val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) - assert(table.stats.isDefined) - assert(table.stats.get.colStats.size == colStats.size) - - colStats.foreach { case (k, v) => - withClue(s"column $k") { - assert(table.stats.get.colStats(k) == v) - } - } - } - } - - // This test will be run twice: with and without Hive support - test("SPARK-18856: non-empty partitioned table should not report zero size") { - withTable("ds_tbl", "hive_tbl") { - spark.range(100).select($"id", $"id" % 5 as "p").write.partitionBy("p").saveAsTable("ds_tbl") - val stats = spark.table("ds_tbl").queryExecution.optimizedPlan.stats - assert(stats.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") - - if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") { - sql("CREATE TABLE hive_tbl(i int) PARTITIONED BY (j int)") - sql("INSERT INTO hive_tbl PARTITION(j=1) SELECT 1") - val stats2 = spark.table("hive_tbl").queryExecution.optimizedPlan.stats - assert(stats2.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") - } - } - } - - // This test will be run twice: with and without Hive support - test("conversion from CatalogStatistics to Statistics") { - withTable("ds_tbl", "hive_tbl") { - // Test data source table - checkStatsConversion(tableName = "ds_tbl", isDatasourceTable = true) - // Test hive serde table - if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") { - checkStatsConversion(tableName = "hive_tbl", isDatasourceTable = false) - } - } - } - - private def checkStatsConversion(tableName: String, isDatasourceTable: Boolean): Unit = { - // Create an 
empty table and run analyze command on it. - val createTableSql = if (isDatasourceTable) { - s"CREATE TABLE $tableName (c1 INT, c2 STRING) USING PARQUET" - } else { - s"CREATE TABLE $tableName (c1 INT, c2 STRING)" - } - sql(createTableSql) - // Analyze only one column. - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS c1") - val (relation, catalogTable) = spark.table(tableName).queryExecution.analyzed.collect { - case catalogRel: CatalogRelation => (catalogRel, catalogRel.tableMeta) - case logicalRel: LogicalRelation => (logicalRel, logicalRel.catalogTable.get) - }.head - val emptyColStat = ColumnStat(0, None, None, 0, 4, 4) - // Check catalog statistics - assert(catalogTable.stats.isDefined) - assert(catalogTable.stats.get.sizeInBytes == 0) - assert(catalogTable.stats.get.rowCount == Some(0)) - assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat)) - - // Check relation statistics - assert(relation.stats.sizeInBytes == 0) - assert(relation.stats.rowCount == Some(0)) - assert(relation.stats.attributeStats.size == 1) - val (attribute, colStat) = relation.stats.attributeStats.head - assert(attribute.name == "c1") - assert(colStat == emptyColStat) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala new file mode 100644 index 0000000000000..41569762d3c59 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.{lang => jl} +import java.sql.{Date, Timestamp} + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics, CatalogTable} +import org.apache.spark.sql.catalyst.plans.logical.ColumnStat +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.Decimal + + +/** + * The base for statistics test cases that we want to include in both the hive module (for + * verifying behavior when using the Hive external catalog) as well as in the sql/core module. 
+ */ +abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils { + import testImplicits._ + + private val dec1 = new java.math.BigDecimal("1.000000000000000000") + private val dec2 = new java.math.BigDecimal("8.000000000000000000") + private val d1 = Date.valueOf("2016-05-08") + private val d2 = Date.valueOf("2016-05-09") + private val t1 = Timestamp.valueOf("2016-05-08 00:00:01") + private val t2 = Timestamp.valueOf("2016-05-09 00:00:02") + + /** + * Define a very simple 3 row table used for testing column serialization. + * Note: last column is seq[int] which doesn't support stats collection. + */ + protected val data = Seq[ + (jl.Boolean, jl.Byte, jl.Short, jl.Integer, jl.Long, + jl.Double, jl.Float, java.math.BigDecimal, + String, Array[Byte], Date, Timestamp, + Seq[Int])]( + (false, 1.toByte, 1.toShort, 1, 1L, 1.0, 1.0f, dec1, "s1", "b1".getBytes, d1, t1, null), + (true, 2.toByte, 3.toShort, 4, 5L, 6.0, 7.0f, dec2, "ss9", "bb0".getBytes, d2, t2, null), + (null, null, null, null, null, null, null, null, null, null, null, null, null) + ) + + /** A mapping from column to the stats collected. */ + protected val stats = mutable.LinkedHashMap( + "cbool" -> ColumnStat(2, Some(false), Some(true), 1, 1, 1), + "cbyte" -> ColumnStat(2, Some(1.toByte), Some(2.toByte), 1, 1, 1), + "cshort" -> ColumnStat(2, Some(1.toShort), Some(3.toShort), 1, 2, 2), + "cint" -> ColumnStat(2, Some(1), Some(4), 1, 4, 4), + "clong" -> ColumnStat(2, Some(1L), Some(5L), 1, 8, 8), + "cdouble" -> ColumnStat(2, Some(1.0), Some(6.0), 1, 8, 8), + "cfloat" -> ColumnStat(2, Some(1.0f), Some(7.0f), 1, 4, 4), + "cdecimal" -> ColumnStat(2, Some(Decimal(dec1)), Some(Decimal(dec2)), 1, 16, 16), + "cstring" -> ColumnStat(2, None, None, 1, 3, 3), + "cbinary" -> ColumnStat(2, None, None, 1, 3, 3), + "cdate" -> ColumnStat(2, Some(DateTimeUtils.fromJavaDate(d1)), + Some(DateTimeUtils.fromJavaDate(d2)), 1, 4, 4), + "ctimestamp" -> ColumnStat(2, Some(DateTimeUtils.fromJavaTimestamp(t1)), + Some(DateTimeUtils.fromJavaTimestamp(t2)), 1, 8, 8) + ) + + private val randomName = new Random(31) + + def getCatalogTable(tableName: String): CatalogTable = { + spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) + } + + def getCatalogStatistics(tableName: String): CatalogStatistics = { + getCatalogTable(tableName).stats.get + } + + def checkTableStats( + tableName: String, + hasSizeInBytes: Boolean, + expectedRowCounts: Option[Int]): Option[CatalogStatistics] = { + val stats = getCatalogTable(tableName).stats + if (hasSizeInBytes || expectedRowCounts.nonEmpty) { + assert(stats.isDefined) + assert(stats.get.sizeInBytes >= 0) + assert(stats.get.rowCount === expectedRowCounts) + } else { + assert(stats.isEmpty) + } + + stats + } + + /** + * Compute column stats for the given DataFrame and compare it with colStats. 
+ */ + def checkColStats( + df: DataFrame, + colStats: mutable.LinkedHashMap[String, ColumnStat]): Unit = { + val tableName = "column_stats_test_" + randomName.nextInt(1000) + withTable(tableName) { + df.write.saveAsTable(tableName) + + // Collect statistics + sql(s"analyze table $tableName compute STATISTICS FOR COLUMNS " + + colStats.keys.mkString(", ")) + + // Validate statistics + val table = getCatalogTable(tableName) + assert(table.stats.isDefined) + assert(table.stats.get.colStats.size == colStats.size) + + colStats.foreach { case (k, v) => + withClue(s"column $k") { + assert(table.stats.get.colStats(k) == v) + } + } + } + } + + // This test will be run twice: with and without Hive support + test("SPARK-18856: non-empty partitioned table should not report zero size") { + withTable("ds_tbl", "hive_tbl") { + spark.range(100).select($"id", $"id" % 5 as "p").write.partitionBy("p").saveAsTable("ds_tbl") + val stats = spark.table("ds_tbl").queryExecution.optimizedPlan.stats + assert(stats.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") + + if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") { + sql("CREATE TABLE hive_tbl(i int) PARTITIONED BY (j int)") + sql("INSERT INTO hive_tbl PARTITION(j=1) SELECT 1") + val stats2 = spark.table("hive_tbl").queryExecution.optimizedPlan.stats + assert(stats2.sizeInBytes > 0, "non-empty partitioned table should not report zero size.") + } + } + } + + // This test will be run twice: with and without Hive support + test("conversion from CatalogStatistics to Statistics") { + withTable("ds_tbl", "hive_tbl") { + // Test data source table + checkStatsConversion(tableName = "ds_tbl", isDatasourceTable = true) + // Test hive serde table + if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") { + checkStatsConversion(tableName = "hive_tbl", isDatasourceTable = false) + } + } + } + + private def checkStatsConversion(tableName: String, isDatasourceTable: Boolean): Unit = { + // Create an empty table and run analyze command on it. + val createTableSql = if (isDatasourceTable) { + s"CREATE TABLE $tableName (c1 INT, c2 STRING) USING PARQUET" + } else { + s"CREATE TABLE $tableName (c1 INT, c2 STRING)" + } + sql(createTableSql) + // Analyze only one column. 
+ sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS c1") + val (relation, catalogTable) = spark.table(tableName).queryExecution.analyzed.collect { + case catalogRel: CatalogRelation => (catalogRel, catalogRel.tableMeta) + case logicalRel: LogicalRelation => (logicalRel, logicalRel.catalogTable.get) + }.head + val emptyColStat = ColumnStat(0, None, None, 0, 4, 4) + // Check catalog statistics + assert(catalogTable.stats.isDefined) + assert(catalogTable.stats.get.sizeInBytes == 0) + assert(catalogTable.stats.get.rowCount == Some(0)) + assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat)) + + // Check relation statistics + assert(relation.stats.sizeInBytes == 0) + assert(relation.stats.rowCount == Some(0)) + assert(relation.stats.attributeStats.size == 1) + val (attribute, colStat) = relation.stats.attributeStats.head + assert(attribute.name == "c1") + assert(colStat == emptyColStat) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index c601038a2b0af..e00fa64e9f2ce 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -25,7 +25,7 @@ import scala.util.matching.Regex import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics, CatalogTable} +import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -33,7 +33,6 @@ import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.HiveExternalCatalog._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types._ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton { @@ -82,58 +81,42 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes // Non-partitioned table - sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect() - sql("INSERT INTO TABLE analyzeTable SELECT * FROM src").collect() - sql("INSERT INTO TABLE analyzeTable SELECT * FROM src").collect() + val nonPartTable = "non_part_table" + withTable(nonPartTable) { + sql(s"CREATE TABLE $nonPartTable (key STRING, value STRING)") + sql(s"INSERT INTO TABLE $nonPartTable SELECT * FROM src") + sql(s"INSERT INTO TABLE $nonPartTable SELECT * FROM src") - sql("ANALYZE TABLE analyzeTable COMPUTE STATISTICS noscan") + sql(s"ANALYZE TABLE $nonPartTable COMPUTE STATISTICS noscan") - assert(queryTotalSize("analyzeTable") === BigInt(11624)) - - sql("DROP TABLE analyzeTable").collect() + assert(queryTotalSize(nonPartTable) === BigInt(11624)) + } // Partitioned table - sql( - """ - |CREATE TABLE analyzeTable_part (key STRING, value STRING) PARTITIONED BY (ds STRING) - """.stripMargin).collect() - sql( - """ - |INSERT INTO TABLE analyzeTable_part PARTITION (ds='2010-01-01') - |SELECT * FROM src - """.stripMargin).collect() - sql( - """ - |INSERT INTO TABLE analyzeTable_part PARTITION (ds='2010-01-02') - |SELECT * FROM src - """.stripMargin).collect() - sql( - """ - |INSERT INTO TABLE analyzeTable_part PARTITION (ds='2010-01-03') - 
|SELECT * FROM src - """.stripMargin).collect() + val partTable = "part_table" + withTable(partTable) { + sql(s"CREATE TABLE $partTable (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"INSERT INTO TABLE $partTable PARTITION (ds='2010-01-01') SELECT * FROM src") + sql(s"INSERT INTO TABLE $partTable PARTITION (ds='2010-01-02') SELECT * FROM src") + sql(s"INSERT INTO TABLE $partTable PARTITION (ds='2010-01-03') SELECT * FROM src") - assert(queryTotalSize("analyzeTable_part") === spark.sessionState.conf.defaultSizeInBytes) + assert(queryTotalSize(partTable) === spark.sessionState.conf.defaultSizeInBytes) - sql("ANALYZE TABLE analyzeTable_part COMPUTE STATISTICS noscan") + sql(s"ANALYZE TABLE $partTable COMPUTE STATISTICS noscan") - assert(queryTotalSize("analyzeTable_part") === BigInt(17436)) - - sql("DROP TABLE analyzeTable_part").collect() + assert(queryTotalSize(partTable) === BigInt(17436)) + } // Try to analyze a temp table - sql("""SELECT * FROM src""").createOrReplaceTempView("tempTable") - intercept[AnalysisException] { - sql("ANALYZE TABLE tempTable COMPUTE STATISTICS") + withView("tempTable") { + sql("""SELECT * FROM src""").createOrReplaceTempView("tempTable") + intercept[AnalysisException] { + sql("ANALYZE TABLE tempTable COMPUTE STATISTICS") + } } - spark.sessionState.catalog.dropTable( - TableIdentifier("tempTable"), ignoreIfNotExists = true, purge = false) } test("SPARK-21079 - analyze table with location different than that of individual partitions") { - def queryTotalSize(tableName: String): BigInt = - spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes - val tableName = "analyzeTable_part" withTable(tableName) { withTempPath { path => @@ -148,15 +131,12 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - assert(queryTotalSize(tableName) === BigInt(17436)) + assert(getCatalogStatistics(tableName).sizeInBytes === BigInt(17436)) } } } test("SPARK-21079 - analyze partitioned table with only a subset of partitions visible") { - def queryTotalSize(tableName: String): BigInt = - spark.table(tableName).queryExecution.analyzed.stats.sizeInBytes - val sourceTableName = "analyzeTable_part" val tableName = "analyzeTable_part_vis" withTable(sourceTableName, tableName) { @@ -188,39 +168,19 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Register only one of the partitions found on disk val ds = partitionDates.head - sql(s"ALTER TABLE $tableName ADD PARTITION (ds='$ds')").collect() + sql(s"ALTER TABLE $tableName ADD PARTITION (ds='$ds')") // Analyze original table - expect 3 partitions sql(s"ANALYZE TABLE $sourceTableName COMPUTE STATISTICS noscan") - assert(queryTotalSize(sourceTableName) === BigInt(3 * 5812)) + assert(getCatalogStatistics(sourceTableName).sizeInBytes === BigInt(3 * 5812)) // Analyze partial-copy table - expect only 1 partition sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - assert(queryTotalSize(tableName) === BigInt(5812)) + assert(getCatalogStatistics(tableName).sizeInBytes === BigInt(5812)) } } } - test("analyzing views is not supported") { - def assertAnalyzeUnsupported(analyzeCommand: String): Unit = { - val err = intercept[AnalysisException] { - sql(analyzeCommand) - } - assert(err.message.contains("ANALYZE TABLE is not supported")) - } - - val tableName = "tbl" - withTable(tableName) { - spark.range(10).write.saveAsTable(tableName) - val viewName = "view" - withView(viewName) { - sql(s"CREATE 
VIEW $viewName AS SELECT * FROM $tableName") - assertAnalyzeUnsupported(s"ANALYZE TABLE $viewName COMPUTE STATISTICS") - assertAnalyzeUnsupported(s"ANALYZE TABLE $viewName COMPUTE STATISTICS FOR COLUMNS id") - } - } - } - test("test table-level statistics for hive tables created in HiveExternalCatalog") { val textTable = "textTable" withTable(textTable) { @@ -290,8 +250,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto if (analyzedByHive) hiveClient.runSqlHive(s"ANALYZE TABLE $tabName COMPUTE STATISTICS") val describeResult1 = hiveClient.runSqlHive(s"DESCRIBE FORMATTED $tabName") - val tableMetadata = - spark.sessionState.catalog.getTableMetadata(TableIdentifier(tabName)).properties + val tableMetadata = getCatalogTable(tabName).properties // statistics info is not contained in the metadata of the original table assert(Seq(StatsSetupConst.COLUMN_STATS_ACCURATE, StatsSetupConst.NUM_FILES, @@ -327,8 +286,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto val tabName = "tab1" withTable(tabName) { createNonPartitionedTable(tabName, analyzedByHive = false, analyzedBySpark = false) - checkTableStats( - tabName, hasSizeInBytes = true, expectedRowCounts = None) + checkTableStats(tabName, hasSizeInBytes = true, expectedRowCounts = None) // ALTER TABLE SET TBLPROPERTIES invalidates some contents of Hive specific statistics // This is triggered by the Hive alterTable API @@ -370,10 +328,6 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } test("alter table should not have the side effect to store statistics in Spark side") { - def getCatalogTable(tableName: String): CatalogTable = { - spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) - } - val table = "alter_table_side_effect" withTable(table) { sql(s"CREATE TABLE $table (i string, j string)") @@ -637,12 +591,12 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it // for robustness - withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") { + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true") { checkTableStats(parquetTable, hasSizeInBytes = false, expectedRowCounts = None) sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") checkTableStats(parquetTable, hasSizeInBytes = true, expectedRowCounts = Some(500)) } - withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") { + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // We still can get tableSize from Hive before Analyze checkTableStats(orcTable, hasSizeInBytes = true, expectedRowCounts = None) sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS") @@ -759,8 +713,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto val parquetTable = "parquetTable" withTable(parquetTable) { sql(createTableCmd) - val catalogTable = spark.sessionState.catalog.getTableMetadata( - TableIdentifier(parquetTable)) + val catalogTable = getCatalogTable(parquetTable) assert(DDLUtils.isDatasourceTable(catalogTable)) // Add a filter to avoid creating too many partitions @@ -795,17 +748,6 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto "partitioned data source table", "CREATE TABLE parquetTable (key STRING, value STRING) USING PARQUET PARTITIONED BY (key)") - test("statistics collection of a table with zero column") { - val table_no_cols = "table_no_cols" - 
withTable(table_no_cols) { - val rddNoCols = sparkContext.parallelize(1 to 10).map(_ => Row.empty) - val dfNoCols = spark.createDataFrame(rddNoCols, StructType(Seq.empty)) - dfNoCols.write.format("json").saveAsTable(table_no_cols) - sql(s"ANALYZE TABLE $table_no_cols COMPUTE STATISTICS") - checkTableStats(table_no_cols, hasSizeInBytes = true, expectedRowCounts = Some(10)) - } - } - /** Used to test refreshing cached metadata once table stats are updated. */ private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean) : (CatalogStatistics, CatalogStatistics) = { From d540dfbff33aa2f8571e0de149dfa3f4e7321113 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 6 Jul 2017 19:12:15 +0800 Subject: [PATCH 107/779] [SPARK-21273][SQL][FOLLOW-UP] Add missing test cases back and revise code style ## What changes were proposed in this pull request? Add missing test cases back and revise code style Follow up the previous PR: https://github.com/apache/spark/pull/18479 ## How was this patch tested? Unit test Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang Closes #18548 from gengliangwang/stat_propagation_revise. --- .../plans/logical/LogicalPlanVisitor.scala | 2 +- .../BasicStatsEstimationSuite.scala | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala index b23045810a4f6..2652f6d72730c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanVisitor.scala @@ -38,10 +38,10 @@ trait LogicalPlanVisitor[T] { case p: Range => visitRange(p) case p: Repartition => visitRepartition(p) case p: RepartitionByExpression => visitRepartitionByExpr(p) + case p: ResolvedHint => visitHint(p) case p: Sample => visitSample(p) case p: ScriptTransformation => visitScriptTransform(p) case p: Union => visitUnion(p) - case p: ResolvedHint => visitHint(p) case p: LogicalPlan => default(p) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index 5fd21a06a109d..913be6d1ff07f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -78,6 +78,37 @@ class BasicStatsEstimationSuite extends PlanTest with StatsEstimationTestBase { checkStats(globalLimit, stats) } + test("sample estimation") { + val sample = Sample(0.0, 0.5, withReplacement = false, (math.random * 1000).toLong, plan) + checkStats(sample, Statistics(sizeInBytes = 60, rowCount = Some(5))) + + // Child doesn't have rowCount in stats + val childStats = Statistics(sizeInBytes = 120) + val childPlan = DummyLogicalPlan(childStats, childStats) + val sample2 = + Sample(0.0, 0.11, withReplacement = false, (math.random * 1000).toLong, childPlan) + checkStats(sample2, Statistics(sizeInBytes = 14)) + } + + test("estimate statistics when the conf changes") { + val expectedDefaultStats = + Statistics( + sizeInBytes = 40, + rowCount = Some(10), + attributeStats = AttributeMap(Seq( + 
AttributeReference("c1", IntegerType)() -> ColumnStat(10, Some(1), Some(10), 0, 4, 4)))) + val expectedCboStats = + Statistics( + sizeInBytes = 4, + rowCount = Some(1), + attributeStats = AttributeMap(Seq( + AttributeReference("c1", IntegerType)() -> ColumnStat(1, Some(5), Some(5), 0, 4, 4)))) + + val plan = DummyLogicalPlan(defaultStats = expectedDefaultStats, cboStats = expectedCboStats) + checkStats( + plan, expectedStatsCboOn = expectedCboStats, expectedStatsCboOff = expectedDefaultStats) + } + /** Check estimated stats when cbo is turned on/off. */ private def checkStats( plan: LogicalPlan, @@ -99,3 +130,17 @@ class BasicStatsEstimationSuite extends PlanTest with StatsEstimationTestBase { private def checkStats(plan: LogicalPlan, expectedStats: Statistics): Unit = checkStats(plan, expectedStats, expectedStats) } + +/** + * This class is used for unit-testing the cbo switch, it mimics a logical plan which computes + * a simple statistics or a cbo estimated statistics based on the conf. + */ +private case class DummyLogicalPlan( + defaultStats: Statistics, + cboStats: Statistics) + extends LeafNode { + + override def output: Seq[Attribute] = Nil + + override def computeStats(): Statistics = if (conf.cboEnabled) cboStats else defaultStats +} From 565e7a8d4ae7879ee704fb94ae9b3da31e202d7e Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Thu, 6 Jul 2017 19:49:34 +0800 Subject: [PATCH 108/779] [SPARK-20950][CORE] add a new config to diskWriteBufferSize which is hard coded before ## What changes were proposed in this pull request? This PR Improvement in two: 1.With spark.shuffle.spill.diskWriteBufferSize configure diskWriteBufferSize of ShuffleExternalSorter. when change the size of the diskWriteBufferSize to test `forceSorterToSpill` The average performance of running 10 times is as follows:(their unit is MS). ``` diskWriteBufferSize: 1M 512K 256K 128K 64K 32K 16K 8K 4K --------------------------------------------------------------------------------------- RecordSize = 2.5M 742 722 694 686 667 668 671 669 683 RecordSize = 1M 294 293 292 287 283 285 281 279 285 ``` 2.Remove outputBufferSizeInBytes and inputBufferSizeInBytes to initialize in mergeSpillsWithFileStream function. ## How was this patch tested? The unit test. Author: caoxuewen Closes #18174 from heary-cao/buffersize. --- .../shuffle/sort/ShuffleExternalSorter.java | 11 +++++--- .../shuffle/sort/UnsafeShuffleWriter.java | 14 +++++++--- .../unsafe/sort/UnsafeSorterSpillWriter.java | 24 ++++++++++------- .../spark/internal/config/package.scala | 27 +++++++++++++++++++ 4 files changed, 60 insertions(+), 16 deletions(-) diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index c33d1e33f030f..338faaadb33d4 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -43,6 +43,7 @@ import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.util.Utils; +import org.apache.spark.internal.config.package$; /** * An external sorter that is specialized for sort-based shuffle. 
@@ -82,6 +83,9 @@ final class ShuffleExternalSorter extends MemoryConsumer { /** The buffer size to use when writing spills using DiskBlockObjectWriter */ private final int fileBufferSizeBytes; + /** The buffer size to use when writing the sorted records to an on-disk file */ + private final int diskWriteBufferSize; + /** * Memory pages that hold the records being sorted. The pages in this list are freed when * spilling, although in principle we could recycle these pages across spills (on the other hand, @@ -116,13 +120,14 @@ final class ShuffleExternalSorter extends MemoryConsumer { this.taskContext = taskContext; this.numPartitions = numPartitions; // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided - this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; + this.fileBufferSizeBytes = (int) (long) conf.get(package$.MODULE$.SHUFFLE_FILE_BUFFER_SIZE()) * 1024; this.numElementsForSpillThreshold = conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", 1024 * 1024 * 1024); this.writeMetrics = writeMetrics; this.inMemSorter = new ShuffleInMemorySorter( this, initialSize, conf.getBoolean("spark.shuffle.sort.useRadixSort", true)); this.peakMemoryUsedBytes = getMemoryUsage(); + this.diskWriteBufferSize = (int) (long) conf.get(package$.MODULE$.SHUFFLE_DISK_WRITE_BUFFER_SIZE()); } /** @@ -155,7 +160,7 @@ private void writeSortedFile(boolean isLastFile) throws IOException { // be an API to directly transfer bytes from managed memory to the disk writer, we buffer // data through a byte array. This array does not need to be large enough to hold a single // record; - final byte[] writeBuffer = new byte[DISK_WRITE_BUFFER_SIZE]; + final byte[] writeBuffer = new byte[diskWriteBufferSize]; // Because this output will be read during shuffle, its compression codec must be controlled by // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use @@ -195,7 +200,7 @@ private void writeSortedFile(boolean isLastFile) throws IOException { int dataRemaining = Platform.getInt(recordPage, recordOffsetInPage); long recordReadPosition = recordOffsetInPage + 4; // skip over record length while (dataRemaining > 0) { - final int toTransfer = Math.min(DISK_WRITE_BUFFER_SIZE, dataRemaining); + final int toTransfer = Math.min(diskWriteBufferSize, dataRemaining); Platform.copyMemory( recordPage, recordReadPosition, writeBuffer, Platform.BYTE_ARRAY_OFFSET, toTransfer); writer.write(writeBuffer, 0, toTransfer); diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index 34c179990214f..1b578491b81d7 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -55,6 +55,7 @@ import org.apache.spark.storage.TimeTrackingOutputStream; import org.apache.spark.unsafe.Platform; import org.apache.spark.util.Utils; +import org.apache.spark.internal.config.package$; @Private public class UnsafeShuffleWriter extends ShuffleWriter { @@ -65,6 +66,7 @@ public class UnsafeShuffleWriter extends ShuffleWriter { @VisibleForTesting static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096; + static final int DEFAULT_INITIAL_SER_BUFFER_SIZE = 1024 * 1024; private final BlockManager blockManager; private final IndexShuffleBlockResolver shuffleBlockResolver; @@ -78,6 +80,8 @@ public class UnsafeShuffleWriter extends 
ShuffleWriter { private final SparkConf sparkConf; private final boolean transferToEnabled; private final int initialSortBufferSize; + private final int inputBufferSizeInBytes; + private final int outputBufferSizeInBytes; @Nullable private MapStatus mapStatus; @Nullable private ShuffleExternalSorter sorter; @@ -140,6 +144,10 @@ public UnsafeShuffleWriter( this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true); this.initialSortBufferSize = sparkConf.getInt("spark.shuffle.sort.initialBufferSize", DEFAULT_INITIAL_SORT_BUFFER_SIZE); + this.inputBufferSizeInBytes = + (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_FILE_BUFFER_SIZE()) * 1024; + this.outputBufferSizeInBytes = + (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE()) * 1024; open(); } @@ -209,7 +217,7 @@ private void open() throws IOException { partitioner.numPartitions(), sparkConf, writeMetrics); - serBuffer = new MyByteArrayOutputStream(1024 * 1024); + serBuffer = new MyByteArrayOutputStream(DEFAULT_INITIAL_SER_BUFFER_SIZE); serOutputStream = serializer.serializeStream(serBuffer); } @@ -360,12 +368,10 @@ private long[] mergeSpillsWithFileStream( final OutputStream bos = new BufferedOutputStream( new FileOutputStream(outputFile), - (int) sparkConf.getSizeAsKb("spark.shuffle.unsafe.file.output.buffer", "32k") * 1024); + outputBufferSizeInBytes); // Use a counting output stream to avoid having to close the underlying file and ask // the file system for its size after each partition is written. final CountingOutputStream mergedFileOutputStream = new CountingOutputStream(bos); - final int inputBufferSizeInBytes = - (int) sparkConf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024; boolean threwException = true; try { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java index 164b9d70b79d7..f9b5493755443 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java @@ -20,9 +20,10 @@ import java.io.File; import java.io.IOException; -import org.apache.spark.serializer.SerializerManager; import scala.Tuple2; +import org.apache.spark.SparkConf; +import org.apache.spark.serializer.SerializerManager; import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.serializer.DummySerializerInstance; import org.apache.spark.storage.BlockId; @@ -30,6 +31,7 @@ import org.apache.spark.storage.DiskBlockObjectWriter; import org.apache.spark.storage.TempLocalBlockId; import org.apache.spark.unsafe.Platform; +import org.apache.spark.internal.config.package$; /** * Spills a list of sorted records to disk. Spill files have the following format: @@ -38,12 +40,16 @@ */ public final class UnsafeSorterSpillWriter { - static final int DISK_WRITE_BUFFER_SIZE = 1024 * 1024; + private final SparkConf conf = new SparkConf(); + + /** The buffer size to use when writing the sorted records to an on-disk file */ + private final int diskWriteBufferSize = + (int) (long) conf.get(package$.MODULE$.SHUFFLE_DISK_WRITE_BUFFER_SIZE()); // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to // be an API to directly transfer bytes from managed memory to the disk writer, we buffer // data through a byte array. 
- private byte[] writeBuffer = new byte[DISK_WRITE_BUFFER_SIZE]; + private byte[] writeBuffer = new byte[diskWriteBufferSize]; private final File file; private final BlockId blockId; @@ -114,7 +120,7 @@ public void write( writeIntToBuffer(recordLength, 0); writeLongToBuffer(keyPrefix, 4); int dataRemaining = recordLength; - int freeSpaceInWriteBuffer = DISK_WRITE_BUFFER_SIZE - 4 - 8; // space used by prefix + len + int freeSpaceInWriteBuffer = diskWriteBufferSize - 4 - 8; // space used by prefix + len long recordReadPosition = baseOffset; while (dataRemaining > 0) { final int toTransfer = Math.min(freeSpaceInWriteBuffer, dataRemaining); @@ -122,15 +128,15 @@ public void write( baseObject, recordReadPosition, writeBuffer, - Platform.BYTE_ARRAY_OFFSET + (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer), + Platform.BYTE_ARRAY_OFFSET + (diskWriteBufferSize - freeSpaceInWriteBuffer), toTransfer); - writer.write(writeBuffer, 0, (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer) + toTransfer); + writer.write(writeBuffer, 0, (diskWriteBufferSize - freeSpaceInWriteBuffer) + toTransfer); recordReadPosition += toTransfer; dataRemaining -= toTransfer; - freeSpaceInWriteBuffer = DISK_WRITE_BUFFER_SIZE; + freeSpaceInWriteBuffer = diskWriteBufferSize; } - if (freeSpaceInWriteBuffer < DISK_WRITE_BUFFER_SIZE) { - writer.write(writeBuffer, 0, (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer)); + if (freeSpaceInWriteBuffer < diskWriteBufferSize) { + writer.write(writeBuffer, 0, (diskWriteBufferSize - freeSpaceInWriteBuffer)); } writer.recordWritten(); } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 8dee0d970c4c6..a629810bf093a 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -336,4 +336,31 @@ package object config { "spark.") .booleanConf .createWithDefault(false) + + private[spark] val SHUFFLE_FILE_BUFFER_SIZE = + ConfigBuilder("spark.shuffle.file.buffer") + .doc("Size of the in-memory buffer for each shuffle file output stream. 
" + + "These buffers reduce the number of disk seeks and system calls made " + + "in creating intermediate shuffle files.") + .bytesConf(ByteUnit.KiB) + .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, + s"The file buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") + .createWithDefaultString("32k") + + private[spark] val SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE = + ConfigBuilder("spark.shuffle.unsafe.file.output.buffer") + .doc("The file system for this buffer size after each partition " + + "is written in unsafe shuffle writer.") + .bytesConf(ByteUnit.KiB) + .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, + s"The buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") + .createWithDefaultString("32k") + + private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE = + ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize") + .doc("The buffer size to use when writing the sorted records to an on-disk file.") + .bytesConf(ByteUnit.BYTE) + .checkValue(v => v > 0 && v <= Int.MaxValue, + s"The buffer size must be greater than 0 and less than ${Int.MaxValue}.") + .createWithDefault(1024 * 1024) } From 26ac085debb54d0104762d1cd4187cdf73f301ba Mon Sep 17 00:00:00 2001 From: Bogdan Raducanu Date: Fri, 7 Jul 2017 01:04:57 +0800 Subject: [PATCH 109/779] [SPARK-21228][SQL] InSet incorrect handling of structs ## What changes were proposed in this pull request? When data type is struct, InSet now uses TypeUtils.getInterpretedOrdering (similar to EqualTo) to build a TreeSet. In other cases it will use a HashSet as before (which should be faster). Similarly, In.eval uses Ordering.equiv instead of equals. ## How was this patch tested? New test in SQLQuerySuite. Author: Bogdan Raducanu Closes #18455 from bogdanrdc/SPARK-21228. --- .../sql/catalyst/expressions/predicates.scala | 57 ++++++++++++------- .../catalyst/expressions/PredicateSuite.scala | 31 +++++----- .../catalyst/optimizer/OptimizeInSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 22 +++++++ 4 files changed, 78 insertions(+), 34 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index f3fe58caa6fe2..7bf10f199f1c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.catalyst.expressions +import scala.collection.immutable.TreeSet + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.catalyst.expressions.codegen.{Predicate => BasePredicate} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, GenerateSafeProjection, GenerateUnsafeProjection, Predicate => BasePredicate} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types._ @@ -175,20 +176,23 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { |[${sub.output.map(_.dataType.catalogString).mkString(", ")}]. 
""".stripMargin) } else { - TypeCheckResult.TypeCheckSuccess + TypeUtils.checkForOrderingExpr(value.dataType, s"function $prettyName") } } case _ => - if (list.exists(l => l.dataType != value.dataType)) { - TypeCheckResult.TypeCheckFailure("Arguments must be same type") + val mismatchOpt = list.find(l => l.dataType != value.dataType) + if (mismatchOpt.isDefined) { + TypeCheckResult.TypeCheckFailure(s"Arguments must be same type but were: " + + s"${value.dataType} != ${mismatchOpt.get.dataType}") } else { - TypeCheckResult.TypeCheckSuccess + TypeUtils.checkForOrderingExpr(value.dataType, s"function $prettyName") } } } override def children: Seq[Expression] = value +: list lazy val inSetConvertible = list.forall(_.isInstanceOf[Literal]) + private lazy val ordering = TypeUtils.getInterpretedOrdering(value.dataType) override def nullable: Boolean = children.exists(_.nullable) override def foldable: Boolean = children.forall(_.foldable) @@ -203,10 +207,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { var hasNull = false list.foreach { e => val v = e.eval(input) - if (v == evaluatedValue) { - return true - } else if (v == null) { + if (v == null) { hasNull = true + } else if (ordering.equiv(v, evaluatedValue)) { + return true } } if (hasNull) { @@ -265,7 +269,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with override def nullable: Boolean = child.nullable || hasNull protected override def nullSafeEval(value: Any): Any = { - if (hset.contains(value)) { + if (set.contains(value)) { true } else if (hasNull) { null @@ -274,27 +278,40 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with } } - def getHSet(): Set[Any] = hset + @transient private[this] lazy val set = child.dataType match { + case _: AtomicType => hset + case _: NullType => hset + case _ => + // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows + TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset + } + + def getSet(): Set[Any] = set override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val setName = classOf[Set[Any]].getName val InSetName = classOf[InSet].getName val childGen = child.genCode(ctx) ctx.references += this - val hsetTerm = ctx.freshName("hset") - val hasNullTerm = ctx.freshName("hasNull") - ctx.addMutableState(setName, hsetTerm, - s"$hsetTerm = (($InSetName)references[${ctx.references.size - 1}]).getHSet();") - ctx.addMutableState("boolean", hasNullTerm, s"$hasNullTerm = $hsetTerm.contains(null);") + val setTerm = ctx.freshName("set") + val setNull = if (hasNull) { + s""" + |if (!${ev.value}) { + | ${ev.isNull} = true; + |} + """.stripMargin + } else { + "" + } + ctx.addMutableState(setName, setTerm, + s"$setTerm = (($InSetName)references[${ctx.references.size - 1}]).getSet();") ev.copy(code = s""" ${childGen.code} boolean ${ev.isNull} = ${childGen.isNull}; boolean ${ev.value} = false; if (!${ev.isNull}) { - ${ev.value} = $hsetTerm.contains(${childGen.value}); - if (!${ev.value} && $hasNullTerm) { - ${ev.isNull} = true; - } + ${ev.value} = $setTerm.contains(${childGen.value}); + $setNull } """) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 6fe295c3dd936..ef510a95ef446 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -35,7 +35,8 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { test(s"3VL $name") { truthTable.foreach { case (l, r, answer) => - val expr = op(NonFoldableLiteral(l, BooleanType), NonFoldableLiteral(r, BooleanType)) + val expr = op(NonFoldableLiteral.create(l, BooleanType), + NonFoldableLiteral.create(r, BooleanType)) checkEvaluation(expr, answer) } } @@ -72,7 +73,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { (false, true) :: (null, null) :: Nil notTrueTable.foreach { case (v, answer) => - checkEvaluation(Not(NonFoldableLiteral(v, BooleanType)), answer) + checkEvaluation(Not(NonFoldableLiteral.create(v, BooleanType)), answer) } checkConsistencyBetweenInterpretedAndCodegen(Not, BooleanType) } @@ -120,22 +121,26 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { (null, null, null) :: Nil) test("IN") { - checkEvaluation(In(NonFoldableLiteral(null, IntegerType), Seq(Literal(1), Literal(2))), null) - checkEvaluation(In(NonFoldableLiteral(null, IntegerType), - Seq(NonFoldableLiteral(null, IntegerType))), null) - checkEvaluation(In(NonFoldableLiteral(null, IntegerType), Seq.empty), null) + checkEvaluation(In(NonFoldableLiteral.create(null, IntegerType), Seq(Literal(1), + Literal(2))), null) + checkEvaluation(In(NonFoldableLiteral.create(null, IntegerType), + Seq(NonFoldableLiteral.create(null, IntegerType))), null) + checkEvaluation(In(NonFoldableLiteral.create(null, IntegerType), Seq.empty), null) checkEvaluation(In(Literal(1), Seq.empty), false) - checkEvaluation(In(Literal(1), Seq(NonFoldableLiteral(null, IntegerType))), null) - checkEvaluation(In(Literal(1), Seq(Literal(1), NonFoldableLiteral(null, IntegerType))), true) - checkEvaluation(In(Literal(2), Seq(Literal(1), NonFoldableLiteral(null, IntegerType))), null) + checkEvaluation(In(Literal(1), Seq(NonFoldableLiteral.create(null, IntegerType))), null) + checkEvaluation(In(Literal(1), Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), + true) + checkEvaluation(In(Literal(2), Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), + null) checkEvaluation(In(Literal(1), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(2), Seq(Literal(1), Literal(2))), true) checkEvaluation(In(Literal(3), Seq(Literal(1), Literal(2))), false) checkEvaluation( - And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), Literal(2)))), + And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), + Literal(2)))), true) - val ns = NonFoldableLiteral(null, StringType) + val ns = NonFoldableLiteral.create(null, StringType) checkEvaluation(In(ns, Seq(Literal("1"), Literal("2"))), null) checkEvaluation(In(ns, Seq(ns)), null) checkEvaluation(In(Literal("a"), Seq(ns)), null) @@ -155,7 +160,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { case _ => value } } - val input = inputData.map(NonFoldableLiteral(_, t)) + val input = inputData.map(NonFoldableLiteral.create(_, t)) val expected = if (inputData(0) == null) { null } else if (inputData.slice(1, 10).contains(inputData(0))) { @@ -279,7 +284,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { test("BinaryComparison: null test") { // Use -1 (default value for codegen) which can trigger some weird bugs, e.g. 
SPARK-14757 val normalInt = Literal(-1) - val nullInt = NonFoldableLiteral(null, IntegerType) + val nullInt = NonFoldableLiteral.create(null, IntegerType) def nullTest(op: (Expression, Expression) => Expression): Unit = { checkEvaluation(op(normalInt, nullInt), null) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index 6a77580b29a21..28bf7b6f84341 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -169,7 +169,7 @@ class OptimizeInSuite extends PlanTest { val optimizedPlan = OptimizeIn(plan) optimizedPlan match { case Filter(cond, _) - if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].getHSet().size == 3 => + if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].getSet().size == 3 => // pass case _ => fail("Unexpected result for OptimizedIn") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 68f61cfab6d2f..5171aaebc9907 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2616,4 +2616,26 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val e = intercept[AnalysisException](sql("SELECT nvl(1, 2, 3)")) assert(e.message.contains("Invalid number of arguments")) } + + test("SPARK-21228: InSet incorrect handling of structs") { + withTempView("A") { + // reduce this from the default of 10 so the repro query text is not too long + withSQLConf((SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> "3")) { + // a relation that has 1 column of struct type with values (1,1), ..., (9, 9) + spark.range(1, 10).selectExpr("named_struct('a', id, 'b', id) as a") + .createOrReplaceTempView("A") + val df = sql( + """ + |SELECT * from + | (SELECT MIN(a) as minA FROM A) AA -- this Aggregate will return UnsafeRows + | -- the IN will become InSet with a Set of GenericInternalRows + | -- a GenericInternalRow is never equal to an UnsafeRow so the query would + | -- returns 0 results, which is incorrect + | WHERE minA IN (NAMED_STRUCT('a', 1L, 'b', 1L), NAMED_STRUCT('a', 2L, 'b', 2L), + | NAMED_STRUCT('a', 3L, 'b', 3L)) + """.stripMargin) + checkAnswer(df, Row(Row(1, 1))) + } + } + } } From 48e44b24a7663142176102ac4c6bf4242f103804 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 7 Jul 2017 01:07:45 +0800 Subject: [PATCH 110/779] [SPARK-21204][SQL] Add support for Scala Set collection types in serialization ## What changes were proposed in this pull request? Currently we can't produce a `Dataset` containing `Set` in SparkSQL. This PR tries to support serialization/deserialization of `Set`. Because there's no corresponding internal data type in SparkSQL for a `Set`, the most proper choice for serializing a set should be an array. ## How was this patch tested? Added unit tests. Author: Liang-Chi Hsieh Closes #18416 from viirya/SPARK-21204. 
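Illustrative usage, not part of this patch: a minimal sketch, assuming a hypothetical local `SparkSession` named `spark`, of what the new `Set` encoder enables. A `Set` value is serialized to a Catalyst array, and an array column (e.g. one produced by `collect_set`) can now be read back as a `Set`.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.collect_set

// Hypothetical driver program, for illustration only.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("set-encoder-sketch")
  .getOrCreate()
import spark.implicits._

// With the new implicit Set encoder in scope, a Dataset of Set can be created
// directly; each Set value is stored as a Catalyst array.
val ds = Seq(Set(1, 2, 3), Set(4, 5)).toDS()
ds.printSchema()   // value: array<int>

// Array columns produced by aggregations such as collect_set can now be
// read back as Set[Int] via the new encoder.
val df = Seq((1, 2), (1, 2), (1, 4)).toDF("key", "value")
val asSets = df.groupBy($"key")
  .agg(collect_set($"value").as("values"))
  .select($"values")
  .as[Set[Int]]
asSets.collect()   // e.g. Array(Set(2, 4))

spark.stop()
```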
--- .../spark/sql/catalyst/ScalaReflection.scala | 28 +++++++++++++++-- .../expressions/objects/objects.scala | 5 +-- .../org/apache/spark/sql/SQLImplicits.scala | 10 ++++++ .../spark/sql/DataFrameAggregateSuite.scala | 10 ++++++ .../spark/sql/DatasetPrimitiveSuite.scala | 31 +++++++++++++++++++ 5 files changed, 79 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 814f2c10b9097..4d5401f30d392 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -309,7 +309,10 @@ object ScalaReflection extends ScalaReflection { Invoke(arrayData, primitiveMethod, arrayCls, returnNullable = false) } - case t if t <:< localTypeOf[Seq[_]] => + // We serialize a `Set` to Catalyst array. When we deserialize a Catalyst array + // to a `Set`, if there are duplicated elements, the elements will be de-duplicated. + case t if t <:< localTypeOf[Seq[_]] || + t <:< localTypeOf[scala.collection.Set[_]] => val TypeRef(_, _, Seq(elementType)) = t val Schema(dataType, elementNullable) = schemaFor(elementType) val className = getClassNameFromType(elementType) @@ -327,8 +330,10 @@ object ScalaReflection extends ScalaReflection { } val companion = t.normalize.typeSymbol.companionSymbol.typeSignature - val cls = companion.declaration(newTermName("newBuilder")) match { - case NoSymbol => classOf[Seq[_]] + val cls = companion.member(newTermName("newBuilder")) match { + case NoSymbol if t <:< localTypeOf[Seq[_]] => classOf[Seq[_]] + case NoSymbol if t <:< localTypeOf[scala.collection.Set[_]] => + classOf[scala.collection.Set[_]] case _ => mirror.runtimeClass(t.typeSymbol.asClass) } UnresolvedMapObjects(mapFunction, getPath, Some(cls)) @@ -502,6 +507,19 @@ object ScalaReflection extends ScalaReflection { serializerFor(_, valueType, valuePath, seenTypeSet), valueNullable = !valueType.typeSymbol.asClass.isPrimitive) + case t if t <:< localTypeOf[scala.collection.Set[_]] => + val TypeRef(_, _, Seq(elementType)) = t + + // There's no corresponding Catalyst type for `Set`, we serialize a `Set` to Catalyst array. + // Note that the property of `Set` is only kept when manipulating the data as domain object. 
+ val newInput = + Invoke( + inputObject, + "toSeq", + ObjectType(classOf[Seq[_]])) + + toCatalystArray(newInput, elementType) + case t if t <:< localTypeOf[String] => StaticInvoke( classOf[UTF8String], @@ -713,6 +731,10 @@ object ScalaReflection extends ScalaReflection { val Schema(valueDataType, valueNullable) = schemaFor(valueType) Schema(MapType(schemaFor(keyType).dataType, valueDataType, valueContainsNull = valueNullable), nullable = true) + case t if t <:< localTypeOf[Set[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val Schema(dataType, nullable) = schemaFor(elementType) + Schema(ArrayType(dataType, containsNull = nullable), nullable = true) case t if t <:< localTypeOf[String] => Schema(StringType, nullable = true) case t if t <:< localTypeOf[java.sql.Timestamp] => Schema(TimestampType, nullable = true) case t if t <:< localTypeOf[java.sql.Date] => Schema(DateType, nullable = true) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 24c06d8b14b54..9b28a18035b1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -627,8 +627,9 @@ case class MapObjects private( val (initCollection, addElement, getResult): (String, String => String, String) = customCollectionCls match { - case Some(cls) if classOf[Seq[_]].isAssignableFrom(cls) => - // Scala sequence + case Some(cls) if classOf[Seq[_]].isAssignableFrom(cls) || + classOf[scala.collection.Set[_]].isAssignableFrom(cls) => + // Scala sequence or set val getBuilder = s"${cls.getName}$$.MODULE$$.newBuilder()" val builder = ctx.freshName("collectionBuilder") ( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala index 86574e2f71d92..05db292bd41b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -171,6 +171,16 @@ abstract class SQLImplicits extends LowPrioritySQLImplicits { /** @since 2.3.0 */ implicit def newMapEncoder[T <: Map[_, _] : TypeTag]: Encoder[T] = ExpressionEncoder() + /** + * Notice that we serialize `Set` to Catalyst array. The set property is only kept when + * manipulating the domain objects. The serialization format doesn't keep the set property. + * When we have a Catalyst array which contains duplicated elements and convert it to + * `Dataset[Set[T]]` by using the encoder, the elements will be de-duplicated. 
+ * + * @since 2.3.0 + */ + implicit def newSetEncoder[T <: Set[_] : TypeTag]: Encoder[T] = ExpressionEncoder() + // Arrays /** @since 1.6.1 */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 5db354d79bb6e..b52d50b195bcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -460,6 +460,16 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { df.select(collect_set($"a"), collect_set($"b")), Seq(Row(Seq(1, 2, 3), Seq(2, 4))) ) + + checkDataset( + df.select(collect_set($"a").as("aSet")).as[Set[Int]], + Set(1, 2, 3)) + checkDataset( + df.select(collect_set($"b").as("bSet")).as[Set[Int]], + Set(2, 4)) + checkDataset( + df.select(collect_set($"a"), collect_set($"b")).as[(Set[Int], Set[Int])], + Seq(Set(1, 2, 3) -> Set(2, 4)): _*) } test("collect functions structs") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala index a6847dcfbffc4..f62f9e23db66d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import scala.collection.immutable.{HashSet => HSet} import scala.collection.immutable.Queue import scala.collection.mutable.{LinkedHashMap => LHMap} import scala.collection.mutable.ArrayBuffer @@ -342,6 +343,31 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext { LHMapClass(LHMap(1 -> 2)) -> LHMap("test" -> MapClass(Map(3 -> 4)))) } + test("arbitrary sets") { + checkDataset(Seq(Set(1, 2, 3, 4)).toDS(), Set(1, 2, 3, 4)) + checkDataset(Seq(Set(1.toLong, 2.toLong)).toDS(), Set(1.toLong, 2.toLong)) + checkDataset(Seq(Set(1.toDouble, 2.toDouble)).toDS(), Set(1.toDouble, 2.toDouble)) + checkDataset(Seq(Set(1.toFloat, 2.toFloat)).toDS(), Set(1.toFloat, 2.toFloat)) + checkDataset(Seq(Set(1.toByte, 2.toByte)).toDS(), Set(1.toByte, 2.toByte)) + checkDataset(Seq(Set(1.toShort, 2.toShort)).toDS(), Set(1.toShort, 2.toShort)) + checkDataset(Seq(Set(true, false)).toDS(), Set(true, false)) + checkDataset(Seq(Set("test1", "test2")).toDS(), Set("test1", "test2")) + checkDataset(Seq(Set(Tuple1(1), Tuple1(2))).toDS(), Set(Tuple1(1), Tuple1(2))) + + checkDataset(Seq(HSet(1, 2)).toDS(), HSet(1, 2)) + checkDataset(Seq(HSet(1.toLong, 2.toLong)).toDS(), HSet(1.toLong, 2.toLong)) + checkDataset(Seq(HSet(1.toDouble, 2.toDouble)).toDS(), HSet(1.toDouble, 2.toDouble)) + checkDataset(Seq(HSet(1.toFloat, 2.toFloat)).toDS(), HSet(1.toFloat, 2.toFloat)) + checkDataset(Seq(HSet(1.toByte, 2.toByte)).toDS(), HSet(1.toByte, 2.toByte)) + checkDataset(Seq(HSet(1.toShort, 2.toShort)).toDS(), HSet(1.toShort, 2.toShort)) + checkDataset(Seq(HSet(true, false)).toDS(), HSet(true, false)) + checkDataset(Seq(HSet("test1", "test2")).toDS(), HSet("test1", "test2")) + checkDataset(Seq(HSet(Tuple1(1), Tuple1(2))).toDS(), HSet(Tuple1(1), Tuple1(2))) + + checkDataset(Seq(Seq(Some(1), None), Seq(Some(2))).toDF("c").as[Set[Integer]], + Seq(Set[Integer](1, null), Set[Integer](2)): _*) + } + test("nested sequences") { checkDataset(Seq(Seq(Seq(1))).toDS(), Seq(Seq(1))) checkDataset(Seq(List(Queue(1))).toDS(), List(Queue(1))) @@ -352,6 +378,11 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext 
{ checkDataset(Seq(LHMap(Map(1 -> 2) -> 3)).toDS(), LHMap(Map(1 -> 2) -> 3)) } + test("nested set") { + checkDataset(Seq(Set(HSet(1, 2), HSet(3, 4))).toDS(), Set(HSet(1, 2), HSet(3, 4))) + checkDataset(Seq(HSet(Set(1, 2), Set(3, 4))).toDS(), HSet(Set(1, 2), Set(3, 4))) + } + test("package objects") { import packageobject._ checkDataset(Seq(PackageClass(1)).toDS(), PackageClass(1)) From bf66335acab3c0c188f6c378eb8aa6948a259cb2 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 6 Jul 2017 13:58:27 -0700 Subject: [PATCH 111/779] [SPARK-21323][SQL] Rename plans.logical.statsEstimation.Range to ValueInterval ## What changes were proposed in this pull request? Rename org.apache.spark.sql.catalyst.plans.logical.statsEstimation.Range to ValueInterval. The current naming is identical to logical operator "range". Refactoring it to ValueInterval is more accurate. ## How was this patch tested? unit test Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang Closes #18549 from gengliangwang/ValueInterval. --- .../statsEstimation/FilterEstimation.scala | 36 ++++++++-------- .../statsEstimation/JoinEstimation.scala | 14 +++---- .../{Range.scala => ValueInterval.scala} | 41 ++++++++++--------- 3 files changed, 48 insertions(+), 43 deletions(-) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/{Range.scala => ValueInterval.scala} (65%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 5a3bee7b9e449..e13db85c7a76e 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -316,8 +316,8 @@ case class FilterEstimation(plan: Filter) extends Logging { // decide if the value is in [min, max] of the column. // We currently don't store min/max for binary/string type. // Hence, we assume it is in boundary for binary/string type. 
- val statsRange = Range(colStat.min, colStat.max, attr.dataType) - if (statsRange.contains(literal)) { + val statsInterval = ValueInterval(colStat.min, colStat.max, attr.dataType) + if (statsInterval.contains(literal)) { if (update) { // We update ColumnStat structure after apply this equality predicate: // Set distinctCount to 1, nullCount to 0, and min/max values (if exist) to the literal @@ -388,9 +388,10 @@ case class FilterEstimation(plan: Filter) extends Logging { // use [min, max] to filter the original hSet dataType match { case _: NumericType | BooleanType | DateType | TimestampType => - val statsRange = Range(colStat.min, colStat.max, dataType).asInstanceOf[NumericRange] + val statsInterval = + ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval] val validQuerySet = hSet.filter { v => - v != null && statsRange.contains(Literal(v, dataType)) + v != null && statsInterval.contains(Literal(v, dataType)) } if (validQuerySet.isEmpty) { @@ -440,12 +441,13 @@ case class FilterEstimation(plan: Filter) extends Logging { update: Boolean): Option[BigDecimal] = { val colStat = colStatsMap(attr) - val statsRange = Range(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericRange] - val max = statsRange.max.toBigDecimal - val min = statsRange.min.toBigDecimal + val statsInterval = + ValueInterval(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericValueInterval] + val max = statsInterval.max.toBigDecimal + val min = statsInterval.min.toBigDecimal val ndv = BigDecimal(colStat.distinctCount) - // determine the overlapping degree between predicate range and column's range + // determine the overlapping degree between predicate interval and column's interval val numericLiteral = if (literal.dataType == BooleanType) { if (literal.value.asInstanceOf[Boolean]) BigDecimal(1) else BigDecimal(0) } else { @@ -566,18 +568,18 @@ case class FilterEstimation(plan: Filter) extends Logging { } val colStatLeft = colStatsMap(attrLeft) - val statsRangeLeft = Range(colStatLeft.min, colStatLeft.max, attrLeft.dataType) - .asInstanceOf[NumericRange] - val maxLeft = statsRangeLeft.max - val minLeft = statsRangeLeft.min + val statsIntervalLeft = ValueInterval(colStatLeft.min, colStatLeft.max, attrLeft.dataType) + .asInstanceOf[NumericValueInterval] + val maxLeft = statsIntervalLeft.max + val minLeft = statsIntervalLeft.min val colStatRight = colStatsMap(attrRight) - val statsRangeRight = Range(colStatRight.min, colStatRight.max, attrRight.dataType) - .asInstanceOf[NumericRange] - val maxRight = statsRangeRight.max - val minRight = statsRangeRight.min + val statsIntervalRight = ValueInterval(colStatRight.min, colStatRight.max, attrRight.dataType) + .asInstanceOf[NumericValueInterval] + val maxRight = statsIntervalRight.max + val minRight = statsIntervalRight.min - // determine the overlapping degree between predicate range and column's range + // determine the overlapping degree between predicate interval and column's interval val allNotNull = (colStatLeft.nullCount == 0) && (colStatRight.nullCount == 0) val (noOverlap: Boolean, completeOverlap: Boolean) = op match { // Left < Right or Left <= Right diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala index f48196997a24d..dcbe36da91dfc 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala @@ -175,9 +175,9 @@ case class InnerOuterEstimation(join: Join) extends Logging { // Check if the two sides are disjoint val leftKeyStats = leftStats.attributeStats(leftKey) val rightKeyStats = rightStats.attributeStats(rightKey) - val lRange = Range(leftKeyStats.min, leftKeyStats.max, leftKey.dataType) - val rRange = Range(rightKeyStats.min, rightKeyStats.max, rightKey.dataType) - if (Range.isIntersected(lRange, rRange)) { + val lInterval = ValueInterval(leftKeyStats.min, leftKeyStats.max, leftKey.dataType) + val rInterval = ValueInterval(rightKeyStats.min, rightKeyStats.max, rightKey.dataType) + if (ValueInterval.isIntersected(lInterval, rInterval)) { // Get the largest ndv among pairs of join keys val maxNdv = leftKeyStats.distinctCount.max(rightKeyStats.distinctCount) if (maxNdv > ndvDenom) ndvDenom = maxNdv @@ -239,16 +239,16 @@ case class InnerOuterEstimation(join: Join) extends Logging { joinKeyPairs.foreach { case (leftKey, rightKey) => val leftKeyStats = leftStats.attributeStats(leftKey) val rightKeyStats = rightStats.attributeStats(rightKey) - val lRange = Range(leftKeyStats.min, leftKeyStats.max, leftKey.dataType) - val rRange = Range(rightKeyStats.min, rightKeyStats.max, rightKey.dataType) + val lInterval = ValueInterval(leftKeyStats.min, leftKeyStats.max, leftKey.dataType) + val rInterval = ValueInterval(rightKeyStats.min, rightKeyStats.max, rightKey.dataType) // When we reach here, join selectivity is not zero, so each pair of join keys should be // intersected. - assert(Range.isIntersected(lRange, rRange)) + assert(ValueInterval.isIntersected(lInterval, rInterval)) // Update intersected column stats assert(leftKey.dataType.sameType(rightKey.dataType)) val newNdv = leftKeyStats.distinctCount.min(rightKeyStats.distinctCount) - val (newMin, newMax) = Range.intersect(lRange, rRange, leftKey.dataType) + val (newMin, newMax) = ValueInterval.intersect(lInterval, rInterval, leftKey.dataType) val newMaxLen = math.min(leftKeyStats.maxLen, rightKeyStats.maxLen) val newAvgLen = (leftKeyStats.avgLen + rightKeyStats.avgLen) / 2 val newStats = ColumnStat(newNdv, newMin, newMax, 0, newAvgLen, newMaxLen) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/Range.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ValueInterval.scala similarity index 65% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/Range.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ValueInterval.scala index 4ac5ba5689f82..0caaf796a3b68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/Range.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ValueInterval.scala @@ -22,12 +22,12 @@ import org.apache.spark.sql.types._ /** Value range of a column. */ -trait Range { +trait ValueInterval { def contains(l: Literal): Boolean } -/** For simplicity we use decimal to unify operations of numeric ranges. */ -case class NumericRange(min: Decimal, max: Decimal) extends Range { +/** For simplicity we use decimal to unify operations of numeric intervals. 
*/ +case class NumericValueInterval(min: Decimal, max: Decimal) extends ValueInterval { override def contains(l: Literal): Boolean = { val lit = EstimationUtils.toDecimal(l.value, l.dataType) min <= lit && max >= lit @@ -38,46 +38,49 @@ case class NumericRange(min: Decimal, max: Decimal) extends Range { * This version of Spark does not have min/max for binary/string types, we define their default * behaviors by this class. */ -class DefaultRange extends Range { +class DefaultValueInterval extends ValueInterval { override def contains(l: Literal): Boolean = true } /** This is for columns with only null values. */ -class NullRange extends Range { +class NullValueInterval extends ValueInterval { override def contains(l: Literal): Boolean = false } -object Range { - def apply(min: Option[Any], max: Option[Any], dataType: DataType): Range = dataType match { - case StringType | BinaryType => new DefaultRange() - case _ if min.isEmpty || max.isEmpty => new NullRange() +object ValueInterval { + def apply( + min: Option[Any], + max: Option[Any], + dataType: DataType): ValueInterval = dataType match { + case StringType | BinaryType => new DefaultValueInterval() + case _ if min.isEmpty || max.isEmpty => new NullValueInterval() case _ => - NumericRange( + NumericValueInterval( min = EstimationUtils.toDecimal(min.get, dataType), max = EstimationUtils.toDecimal(max.get, dataType)) } - def isIntersected(r1: Range, r2: Range): Boolean = (r1, r2) match { - case (_, _: DefaultRange) | (_: DefaultRange, _) => - // The DefaultRange represents string/binary types which do not have max/min stats, + def isIntersected(r1: ValueInterval, r2: ValueInterval): Boolean = (r1, r2) match { + case (_, _: DefaultValueInterval) | (_: DefaultValueInterval, _) => + // The DefaultValueInterval represents string/binary types which do not have max/min stats, // we assume they are intersected to be conservative on estimation true - case (_, _: NullRange) | (_: NullRange, _) => + case (_, _: NullValueInterval) | (_: NullValueInterval, _) => false - case (n1: NumericRange, n2: NumericRange) => + case (n1: NumericValueInterval, n2: NumericValueInterval) => n1.min.compareTo(n2.max) <= 0 && n1.max.compareTo(n2.min) >= 0 } /** - * Intersected results of two ranges. This is only for two overlapped ranges. + * Intersected results of two intervals. This is only for two overlapped intervals. * The outputs are the intersected min/max values. */ - def intersect(r1: Range, r2: Range, dt: DataType): (Option[Any], Option[Any]) = { + def intersect(r1: ValueInterval, r2: ValueInterval, dt: DataType): (Option[Any], Option[Any]) = { (r1, r2) match { - case (_, _: DefaultRange) | (_: DefaultRange, _) => + case (_, _: DefaultValueInterval) | (_: DefaultValueInterval, _) => // binary/string types don't support intersecting. (None, None) - case (n1: NumericRange, n2: NumericRange) => + case (n1: NumericValueInterval, n2: NumericValueInterval) => // Choose the maximum of two min values, and the minimum of two max values. val newMin = if (n1.min <= n2.min) n2.min else n1.min val newMax = if (n1.max <= n2.max) n1.max else n2.max From 0217dfd26f89133f146197359b556c9bf5aca172 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 6 Jul 2017 17:28:20 -0700 Subject: [PATCH 112/779] [SPARK-21267][SS][DOCS] Update Structured Streaming Documentation ## What changes were proposed in this pull request? 
Few changes to the Structured Streaming documentation - Clarify that the entire stream input table is not materialized - Add information for Ganglia - Add Kafka Sink to the main docs - Removed a couple of leftover experimental tags - Added more associated reading material and talk videos. In addition, https://github.com/apache/spark/pull/16856 broke the link to the RDD programming guide in several places while renaming the page. This PR fixes those sameeragarwal cloud-fan. - Added a redirection to avoid breaking internal and possible external links. - Removed unnecessary redirection pages that were there since the separate scala, java, and python programming guides were merged together in 2013 or 2014. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Tathagata Das Closes #18485 from tdas/SPARK-21267. --- docs/_layouts/global.html | 7 +- docs/index.md | 13 +- docs/java-programming-guide.md | 7 - docs/programming-guide.md | 7 + docs/python-programming-guide.md | 7 - docs/rdd-programming-guide.md | 2 +- docs/scala-programming-guide.md | 7 - docs/sql-programming-guide.md | 16 +- .../structured-streaming-programming-guide.md | 172 +++++++++++++++--- .../scala/org/apache/spark/sql/Dataset.scala | 3 - 10 files changed, 169 insertions(+), 72 deletions(-) delete mode 100644 docs/java-programming-guide.md create mode 100644 docs/programming-guide.md delete mode 100644 docs/python-programming-guide.md delete mode 100644 docs/scala-programming-guide.md diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index c00d0db63cd10..570483c0b04ea 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -69,11 +69,10 @@ Programming Guides
+### Reporting Metrics using Dropwizard
+Spark supports reporting metrics using the [Dropwizard Library](monitoring.html#metrics). To enable metrics of Structured Streaming queries to be reported as well, you have to explicitly enable the configuration `spark.sql.streaming.metricsEnabled` in the SparkSession.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+spark.conf.set("spark.sql.streaming.metricsEnabled", "true")
+// or
+spark.sql("SET spark.sql.streaming.metricsEnabled=true")
+{% endhighlight %}
+</div>
+<div data-lang="java" markdown="1">
+{% highlight java %}
+spark.conf().set("spark.sql.streaming.metricsEnabled", "true");
+// or
+spark.sql("SET spark.sql.streaming.metricsEnabled=true");
+{% endhighlight %}
+</div>
+<div data-lang="python" markdown="1">
+{% highlight python %}
+spark.conf.set("spark.sql.streaming.metricsEnabled", "true")
+# or
+spark.sql("SET spark.sql.streaming.metricsEnabled=true")
+{% endhighlight %}
+</div>
+<div data-lang="r" markdown="1">
+{% highlight r %}
+sql("SET spark.sql.streaming.metricsEnabled=true")
+{% endhighlight %}
+</div>
+</div>
+ + +All queries started in the SparkSession after this configuration has been enabled will report metrics through Dropwizard to whatever [sinks](monitoring.html#metrics) have been configured (e.g. Ganglia, Graphite, JMX, etc.). + ## Recovering from Failures with Checkpointing In case of a failure or intentional shutdown, you can recover the previous progress and state of a previous query, and continue where it left off. This is done using checkpointing and write ahead logs. You can configure a query with a checkpoint location, and the query will save all the progress information (i.e. range of offsets processed in each trigger) and the running aggregates (e.g. word counts in the [quick example](#quick-example)) to the checkpoint location. This checkpoint location has to be a path in an HDFS compatible file system, and can be set as an option in the DataStreamWriter when [starting a query](#starting-streaming-queries). @@ -1971,8 +2082,23 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat -# Where to go from here -- Examples: See and run the -[Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/sql/streaming)/[Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples/sql/streaming)/[Python]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/python/sql/streaming)/[R]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/r/streaming) -examples. +# Additional Information + +**Further Reading** + +- See and run the + [Scala]({{site.SPARK_GITHUB_URL}}/tree/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/scala/org/apache/spark/examples/sql/streaming)/[Java]({{site.SPARK_GITHUB_URL}}/tree/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/java/org/apache/spark/examples/sql/streaming)/[Python]({{site.SPARK_GITHUB_URL}}/tree/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/python/sql/streaming)/[R]({{site.SPARK_GITHUB_URL}}/tree/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/r/streaming) + examples. 
+ - [Instructions](index.html#running-the-examples-and-shell) on how to run Spark examples +- Read about integrating with Kafka in the [Structured Streaming Kafka Integration Guide](structured-streaming-kafka-integration.html) +- Read more details about using DataFrames/Datasets in the [Spark SQL Programming Guide](sql-programming-guide.html) +- Third-party Blog Posts + - [Real-time Streaming ETL with Structured Streaming in Apache Spark 2.1 (Databricks Blog)](https://databricks.com/blog/2017/01/19/real-time-streaming-etl-structured-streaming-apache-spark-2-1.html) + - [Real-Time End-to-End Integration with Apache Kafka in Apache Spark’s Structured Streaming (Databricks Blog)](https://databricks.com/blog/2017/04/04/real-time-end-to-end-integration-with-apache-kafka-in-apache-sparks-structured-streaming.html) + - [Event-time Aggregation and Watermarking in Apache Spark’s Structured Streaming (Databricks Blog)](https://databricks.com/blog/2017/05/08/event-time-aggregation-watermarking-apache-sparks-structured-streaming.html) + +**Talks** + +- Spark Summit 2017 Talk - [Easy, Scalable, Fault-tolerant Stream Processing with Structured Streaming in Apache Spark](https://spark-summit.org/2017/events/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark/) - Spark Summit 2016 Talk - [A Deep Dive into Structured Streaming](https://spark-summit.org/2016/events/a-deep-dive-into-structured-streaming/) + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 7be4aa1ca9562..b1638a2180b07 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -520,7 +520,6 @@ class Dataset[T] private[sql]( * @group streaming * @since 2.0.0 */ - @Experimental @InterfaceStability.Evolving def isStreaming: Boolean = logicalPlan.isStreaming @@ -581,7 +580,6 @@ class Dataset[T] private[sql]( } /** - * :: Experimental :: * Defines an event time watermark for this [[Dataset]]. A watermark tracks a point in time * before which we assume no more late data is going to arrive. * @@ -605,7 +603,6 @@ class Dataset[T] private[sql]( * @group streaming * @since 2.1.0 */ - @Experimental @InterfaceStability.Evolving // We only accept an existing column name, not a derived column here as a watermark that is // defined on a derived column cannot referenced elsewhere in the plan. From 40c7add3a4c811202d1fa2be9606aca08df81266 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 7 Jul 2017 08:44:31 +0800 Subject: [PATCH 113/779] [SPARK-20946][SQL] Do not update conf for existing SparkContext in SparkSession.getOrCreate ## What changes were proposed in this pull request? SparkContext is shared by all sessions, we should not update its conf for only one session. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #18536 from cloud-fan/config. 
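For illustration, a minimal sketch of the new behaviour (not code from this patch; the `key2`/`value2` pair just mirrors the keys used in the updated `SparkSessionBuilderSuite` assertions):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

// A SparkContext that already exists and is shared by all sessions.
val sc = SparkContext.getOrCreate(
  new SparkConf().setMaster("local[2]").setAppName("shared-context"))

// Builder options still end up in the new session's conf...
val session = SparkSession.builder().config("key2", "value2").getOrCreate()
assert(session.conf.get("key2") == "value2")

// ...but they are no longer copied into the conf of the pre-existing SparkContext.
assert(!sc.getConf.contains("key2"))
```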
--- .../spark/ml/recommendation/ALSSuite.scala | 4 +--- .../apache/spark/ml/tree/impl/TreeTests.scala | 2 -- .../org/apache/spark/sql/SparkSession.scala | 19 +++++++------------ .../spark/sql/SparkSessionBuilderSuite.scala | 8 +++----- 4 files changed, 11 insertions(+), 22 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 3094f52ba1bc5..b57fc8d21ab34 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -818,15 +818,13 @@ class ALSCleanerSuite extends SparkFunSuite { FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet try { conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "test", conf) + val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf) try { sc.setCheckpointDir(checkpointDir.getAbsolutePath) // Generate test data val (training, _) = ALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0) // Implicitly test the cleaning of parents during ALS training val spark = SparkSession.builder - .master("local[2]") - .appName("ALSCleanerSuite") .sparkContext(sc) .getOrCreate() import spark.implicits._ diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala index 92a236928e90b..b6894b30b0c2b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala @@ -43,8 +43,6 @@ private[ml] object TreeTests extends SparkFunSuite { categoricalFeatures: Map[Int, Int], numClasses: Int): DataFrame = { val spark = SparkSession.builder() - .master("local[2]") - .appName("TreeTests") .sparkContext(data.sparkContext) .getOrCreate() import spark.implicits._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 0ddcd2111aa58..6dfe8a66baa9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -867,7 +867,7 @@ object SparkSession { * * @since 2.2.0 */ - def withExtensions(f: SparkSessionExtensions => Unit): Builder = { + def withExtensions(f: SparkSessionExtensions => Unit): Builder = synchronized { f(extensions) this } @@ -912,21 +912,16 @@ object SparkSession { // No active nor global default session. Create a new one. val sparkContext = userSuppliedContext.getOrElse { - // set app name if not given - val randomAppName = java.util.UUID.randomUUID().toString val sparkConf = new SparkConf() options.foreach { case (k, v) => sparkConf.set(k, v) } + + // set a random app name if not given. if (!sparkConf.contains("spark.app.name")) { - sparkConf.setAppName(randomAppName) - } - val sc = SparkContext.getOrCreate(sparkConf) - // maybe this is an existing SparkContext, update its SparkConf which maybe used - // by SparkSession - options.foreach { case (k, v) => sc.conf.set(k, v) } - if (!sc.conf.contains("spark.app.name")) { - sc.conf.setAppName(randomAppName) + sparkConf.setAppName(java.util.UUID.randomUUID().toString) } - sc + + SparkContext.getOrCreate(sparkConf) + // Do not update `SparkConf` for existing `SparkContext`, as it's shared by all sessions. } // Initialize extensions if the user has defined a configurator class. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index cdac6827082c4..770e15629c839 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -102,11 +102,9 @@ class SparkSessionBuilderSuite extends SparkFunSuite { assert(session.conf.get("key1") == "value1") assert(session.conf.get("key2") == "value2") assert(session.sparkContext == sparkContext2) - assert(session.sparkContext.conf.get("key1") == "value1") - // If the created sparkContext is not passed through the Builder's API sparkContext, - // the conf of this sparkContext will also contain the conf set through the API config. - assert(session.sparkContext.conf.get("key2") == "value2") - assert(session.sparkContext.conf.get("spark.app.name") == "test") + // We won't update conf for existing `SparkContext` + assert(!sparkContext2.conf.contains("key2")) + assert(sparkContext2.conf.get("key1") == "value1") session.stop() } From e5bb26174d3336e07dd670eec4fd2137df346163 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Thu, 6 Jul 2017 18:11:41 -0700 Subject: [PATCH 114/779] [SPARK-21329][SS] Make EventTimeWatermarkExec explicitly UnaryExecNode ## What changes were proposed in this pull request? Making EventTimeWatermarkExec explicitly UnaryExecNode /cc tdas zsxwing ## How was this patch tested? Local build. Author: Jacek Laskowski Closes #18509 from jaceklaskowski/EventTimeWatermarkExec-UnaryExecNode. --- .../sql/execution/streaming/EventTimeWatermarkExec.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index 25cf609fc336e..87e5b78550423 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -21,7 +21,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 @@ -81,7 +81,7 @@ class EventTimeStatsAccum(protected var currentStats: EventTimeStats = EventTime case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, - child: SparkPlan) extends SparkPlan { + child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) @@ -117,6 +117,4 @@ case class EventTimeWatermarkExec( a } } - - override def children: Seq[SparkPlan] = child :: Nil } From d451b7f43d559aa1efd7ac3d1cbec5249f3a7a24 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 7 Jul 2017 12:24:03 +0800 Subject: [PATCH 115/779] [SPARK-21326][SPARK-21066][ML] Use TextFileFormat in LibSVMFileFormat and allow multiple input paths for determining numFeatures ## What changes were proposed in this pull request? 
This is related with [SPARK-19918](https://issues.apache.org/jira/browse/SPARK-19918) and [SPARK-18362](https://issues.apache.org/jira/browse/SPARK-18362). This PR proposes to use `TextFileFormat` and allow multiple input paths (but with a warning) when determining the number of features in LibSVM data source via an extra scan. There are three points here: - The main advantage of this change should be to remove file-listing bottlenecks in driver side. - Another advantage is ones from using `FileScanRDD`. For example, I guess we can use `spark.sql.files.ignoreCorruptFiles` option when determining the number of features. - We can unify the schema inference code path in text based data sources. This is also a preparation for [SPARK-21289](https://issues.apache.org/jira/browse/SPARK-21289). ## How was this patch tested? Unit tests in `LibSVMRelationSuite`. Closes #18288 Author: hyukjinkwon Closes #18556 from HyukjinKwon/libsvm-schema. --- .../ml/source/libsvm/LibSVMRelation.scala | 26 +++++++++---------- .../org/apache/spark/mllib/util/MLUtils.scala | 25 ++++++++++++++++-- .../source/libsvm/LibSVMRelationSuite.scala | 17 +++++++++--- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index f68847a664b69..dec118330aec6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} +import org.apache.spark.internal.Logging import org.apache.spark.TaskContext import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vectors, VectorUDT} @@ -66,7 +67,10 @@ private[libsvm] class LibSVMOutputWriter( /** @see [[LibSVMDataSource]] for public documentation. */ // If this is moved or renamed, please update DataSource's backwardCompatibilityMap. -private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { +private[libsvm] class LibSVMFileFormat + extends TextBasedFileFormat + with DataSourceRegister + with Logging { override def shortName(): String = "libsvm" @@ -89,18 +93,14 @@ private[libsvm] class LibSVMFileFormat extends TextBasedFileFormat with DataSour files: Seq[FileStatus]): Option[StructType] = { val libSVMOptions = new LibSVMOptions(options) val numFeatures: Int = libSVMOptions.numFeatures.getOrElse { - // Infers number of features if the user doesn't specify (a valid) one. - val dataFiles = files.filterNot(_.getPath.getName startsWith "_") - val path = if (dataFiles.length == 1) { - dataFiles.head.getPath.toUri.toString - } else if (dataFiles.isEmpty) { - throw new IOException("No input path specified for libsvm data") - } else { - throw new IOException("Multiple input paths are not supported for libsvm data.") - } - - val sc = sparkSession.sparkContext - val parsed = MLUtils.parseLibSVMFile(sc, path, sc.defaultParallelism) + require(files.nonEmpty, "No input path specified for libsvm data") + logWarning( + "'numFeatures' option not specified, determining the number of features by going " + + "though the input. 
If you know the number in advance, please specify it via " + + "'numFeatures' option to avoid the extra scan.") + + val paths = files.map(_.getPath.toUri.toString) + val parsed = MLUtils.parseLibSVMFile(sparkSession, paths) MLUtils.computeNumFeatures(parsed) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 4fdad05973969..14af8b5c73870 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -28,8 +28,10 @@ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD} -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.datasources.text.TextFileFormat +import org.apache.spark.sql.functions._ import org.apache.spark.storage.StorageLevel import org.apache.spark.util.random.BernoulliCellSampler @@ -102,6 +104,25 @@ object MLUtils extends Logging { .map(parseLibSVMRecord) } + private[spark] def parseLibSVMFile( + sparkSession: SparkSession, paths: Seq[String]): RDD[(Double, Array[Int], Array[Double])] = { + val lines = sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + className = classOf[TextFileFormat].getName + ).resolveRelation(checkFilesExist = false)) + .select("value") + + import lines.sqlContext.implicits._ + + lines.select(trim($"value").as("line")) + .filter(not((length($"line") === 0).or($"line".startsWith("#")))) + .as[String] + .rdd + .map(MLUtils.parseLibSVMRecord) + } + private[spark] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[Double]) = { val items = line.split(' ') val label = items.head.toDouble diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala index e164d279f3f02..a67e49d54e148 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala @@ -35,15 +35,22 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { override def beforeAll(): Unit = { super.beforeAll() - val lines = + val lines0 = """ |1 1:1.0 3:2.0 5:3.0 |0 + """.stripMargin + val lines1 = + """ |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") - val file = new File(dir, "part-00000") - Files.write(lines, file, StandardCharsets.UTF_8) + val succ = new File(dir, "_SUCCESS") + val file0 = new File(dir, "part-00000") + val file1 = new File(dir, "part-00001") + Files.write("", succ, StandardCharsets.UTF_8) + Files.write(lines0, file0, StandardCharsets.UTF_8) + Files.write(lines1, file1, StandardCharsets.UTF_8) path = dir.toURI.toString } @@ -145,7 +152,9 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { test("create libsvmTable table without schema and path") { try { - val e = intercept[IOException](spark.sql("CREATE TABLE libsvmTable USING libsvm")) + val e = intercept[IllegalArgumentException] { + spark.sql("CREATE TABLE libsvmTable USING libsvm") + } 
assert(e.getMessage.contains("No input path specified for libsvm data")) } finally { spark.sql("DROP TABLE IF EXISTS libsvmTable") From 53c2eb59b2cc557081f6a252748dc38511601b0d Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 7 Jul 2017 14:05:22 +0900 Subject: [PATCH 116/779] [SPARK-21327][SQL][PYSPARK] ArrayConstructor should handle an array of typecode 'l' as long rather than int in Python 2. ## What changes were proposed in this pull request? Currently `ArrayConstructor` handles an array of typecode `'l'` as `int` when converting Python object in Python 2 into Java object, so if the value is larger than `Integer.MAX_VALUE` or smaller than `Integer.MIN_VALUE` then the overflow occurs. ```python import array data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))] df = spark.createDataFrame(data) df.show(truncate=False) ``` ``` +----------+ |longarray | +----------+ |[0, 0, -1]| +----------+ ``` This should be: ``` +----------------------------------------------+ |longarray | +----------------------------------------------+ |[-9223372036854775808, 0, 9223372036854775807]| +----------------------------------------------+ ``` ## How was this patch tested? Added a test and existing tests. Author: Takuya UESHIN Closes #18553 from ueshin/issues/SPARK-21327. --- .../scala/org/apache/spark/api/python/SerDeUtil.scala | 10 ++++++++++ python/pyspark/sql/tests.py | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 6e4eab4b805c1..42f67e8dbe865 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -73,6 +73,16 @@ private[spark] object SerDeUtil extends Logging { // This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1) construct(typecode, machineCodes(typecode), data) + } else if (args.length == 2 && args(0) == "l") { + // On Python 2, an array of typecode 'l' should be handled as long rather than int. + val values = args(1).asInstanceOf[JArrayList[_]] + val result = new Array[Long](values.size) + var i = 0 + while (i < values.size) { + result(i) = values.get(i).asInstanceOf[Number].longValue() + i += 1 + } + result } else { super.construct(args) } diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c0e3b8d132396..9db2f40474f70 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2342,6 +2342,12 @@ def test_to_pandas(self): self.assertEquals(types[2], np.bool) self.assertEquals(types[3], np.float32) + def test_create_dataframe_from_array_of_long(self): + import array + data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))] + df = self.spark.createDataFrame(data) + self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807])) + class HiveSparkSubmitTests(SparkSubmitTests): From c09b31eb8fa83d5463a045c9278f5874ae505a8e Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 7 Jul 2017 13:09:32 +0800 Subject: [PATCH 117/779] [SPARK-21217][SQL] Support ColumnVector.Array.toArray() ## What changes were proposed in this pull request? This PR implements bulk-copy for `ColumnVector.Array.toArray()` methods (e.g. `toIntArray()`) in `ColumnVector.Array` by using `System.arrayCopy()` or `Platform.copyMemory()`. 
Before this PR, when one of these method is called, the generic method in `ArrayData` is called. It is not fast since element-wise copy is performed. This PR can improve performance of a benchmark program by 1.9x and 3.2x. Without this PR ``` OpenJDK 64-Bit Server VM 1.8.0_131-8u131-b11-0ubuntu1.16.04.2-b11 on Linux 4.4.0-66-generic Intel(R) Xeon(R) CPU E5-2667 v3 3.20GHz Int Array Best/Avg Time(ms) Rate(M/s) Per Row(ns) ------------------------------------------------------------------------------------------------ ON_HEAP 586 / 628 14.3 69.9 OFF_HEAP 893 / 902 9.4 106.5 ``` With this PR ``` OpenJDK 64-Bit Server VM 1.8.0_131-8u131-b11-0ubuntu1.16.04.2-b11 on Linux 4.4.0-66-generic Intel(R) Xeon(R) CPU E5-2667 v3 3.20GHz Int Array Best/Avg Time(ms) Rate(M/s) Per Row(ns) ------------------------------------------------------------------------------------------------ ON_HEAP 306 / 331 27.4 36.4 OFF_HEAP 282 / 287 29.8 33.6 ``` Source program ``` (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { val len = 8 * 1024 * 1024 val column = ColumnVector.allocate(len * 2, new ArrayType(IntegerType, false), memMode) val data = column.arrayData var i = 0 while (i < len) { data.putInt(i, i) i += 1 } column.putArray(0, 0, len) val benchmark = new Benchmark("Int Array", len, minNumIters = 20) benchmark.addCase(s"$memMode") { iter => var i = 0 while (i < 50) { column.getArray(0).toIntArray i += 1 } } benchmark.run }} ``` ## How was this patch tested? Added test suite Author: Kazuaki Ishizaki Closes #18425 from kiszk/SPARK-21217. --- .../execution/vectorized/ColumnVector.java | 56 ++++++++++++++++++ .../vectorized/OffHeapColumnVector.java | 58 +++++++++++++++++++ .../vectorized/OnHeapColumnVector.java | 58 +++++++++++++++++++ .../vectorized/ColumnarBatchSuite.scala | 49 ++++++++++++++++ 4 files changed, 221 insertions(+) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index 24260a60197f2..0c027f80d48cc 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -100,6 +100,27 @@ public ArrayData copy() { throw new UnsupportedOperationException(); } + @Override + public boolean[] toBooleanArray() { return data.getBooleans(offset, length); } + + @Override + public byte[] toByteArray() { return data.getBytes(offset, length); } + + @Override + public short[] toShortArray() { return data.getShorts(offset, length); } + + @Override + public int[] toIntArray() { return data.getInts(offset, length); } + + @Override + public long[] toLongArray() { return data.getLongs(offset, length); } + + @Override + public float[] toFloatArray() { return data.getFloats(offset, length); } + + @Override + public double[] toDoubleArray() { return data.getDoubles(offset, length); } + // TODO: this is extremely expensive. @Override public Object[] array() { @@ -366,6 +387,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract boolean getBoolean(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract boolean[] getBooleans(int rowId, int count); + /** * Sets the value at rowId to `value`. 
*/ @@ -386,6 +412,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract byte getByte(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract byte[] getBytes(int rowId, int count); + /** * Sets the value at rowId to `value`. */ @@ -406,6 +437,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract short getShort(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract short[] getShorts(int rowId, int count); + /** * Sets the value at rowId to `value`. */ @@ -432,6 +468,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract int getInt(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract int[] getInts(int rowId, int count); + /** * Returns the dictionary Id for rowId. * This should only be called when the ColumnVector is dictionaryIds. @@ -465,6 +506,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract long getLong(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract long[] getLongs(int rowId, int count); + /** * Sets the value at rowId to `value`. */ @@ -491,6 +537,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract float getFloat(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract float[] getFloats(int rowId, int count); + /** * Sets the value at rowId to `value`. */ @@ -517,6 +568,11 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { */ public abstract double getDouble(int rowId); + /** + * Gets values from [rowId, rowId + count) + */ + public abstract double[] getDoubles(int rowId, int count); + /** * Puts a byte array that already exists in this column. 
*/ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index a7d3744d00e91..2d1f3da8e7463 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -134,6 +134,16 @@ public void putBooleans(int rowId, int count, boolean value) { @Override public boolean getBoolean(int rowId) { return Platform.getByte(null, data + rowId) == 1; } + @Override + public boolean[] getBooleans(int rowId, int count) { + assert(dictionary == null); + boolean[] array = new boolean[count]; + for (int i = 0; i < count; ++i) { + array[i] = (Platform.getByte(null, data + rowId + i) == 1); + } + return array; + } + // // APIs dealing with Bytes // @@ -165,6 +175,14 @@ public byte getByte(int rowId) { } } + @Override + public byte[] getBytes(int rowId, int count) { + assert(dictionary == null); + byte[] array = new byte[count]; + Platform.copyMemory(null, data + rowId, array, Platform.BYTE_ARRAY_OFFSET, count); + return array; + } + // // APIs dealing with shorts // @@ -197,6 +215,14 @@ public short getShort(int rowId) { } } + @Override + public short[] getShorts(int rowId, int count) { + assert(dictionary == null); + short[] array = new short[count]; + Platform.copyMemory(null, data + rowId * 2, array, Platform.SHORT_ARRAY_OFFSET, count * 2); + return array; + } + // // APIs dealing with ints // @@ -244,6 +270,14 @@ public int getInt(int rowId) { } } + @Override + public int[] getInts(int rowId, int count) { + assert(dictionary == null); + int[] array = new int[count]; + Platform.copyMemory(null, data + rowId * 4, array, Platform.INT_ARRAY_OFFSET, count * 4); + return array; + } + /** * Returns the dictionary Id for rowId. * This should only be called when the ColumnVector is dictionaryIds. @@ -302,6 +336,14 @@ public long getLong(int rowId) { } } + @Override + public long[] getLongs(int rowId, int count) { + assert(dictionary == null); + long[] array = new long[count]; + Platform.copyMemory(null, data + rowId * 8, array, Platform.LONG_ARRAY_OFFSET, count * 8); + return array; + } + // // APIs dealing with floats // @@ -348,6 +390,14 @@ public float getFloat(int rowId) { } } + @Override + public float[] getFloats(int rowId, int count) { + assert(dictionary == null); + float[] array = new float[count]; + Platform.copyMemory(null, data + rowId * 4, array, Platform.FLOAT_ARRAY_OFFSET, count * 4); + return array; + } + // // APIs dealing with doubles @@ -395,6 +445,14 @@ public double getDouble(int rowId) { } } + @Override + public double[] getDoubles(int rowId, int count) { + assert(dictionary == null); + double[] array = new double[count]; + Platform.copyMemory(null, data + rowId * 8, array, Platform.DOUBLE_ARRAY_OFFSET, count * 8); + return array; + } + // // APIs dealing with Arrays. 
// diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 94ed32294cfae..506434364be48 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -130,6 +130,16 @@ public boolean getBoolean(int rowId) { return byteData[rowId] == 1; } + @Override + public boolean[] getBooleans(int rowId, int count) { + assert(dictionary == null); + boolean[] array = new boolean[count]; + for (int i = 0; i < count; ++i) { + array[i] = (byteData[rowId + i] == 1); + } + return array; + } + // // @@ -162,6 +172,14 @@ public byte getByte(int rowId) { } } + @Override + public byte[] getBytes(int rowId, int count) { + assert(dictionary == null); + byte[] array = new byte[count]; + System.arraycopy(byteData, rowId, array, 0, count); + return array; + } + // // APIs dealing with Shorts // @@ -192,6 +210,14 @@ public short getShort(int rowId) { } } + @Override + public short[] getShorts(int rowId, int count) { + assert(dictionary == null); + short[] array = new short[count]; + System.arraycopy(shortData, rowId, array, 0, count); + return array; + } + // // APIs dealing with Ints @@ -234,6 +260,14 @@ public int getInt(int rowId) { } } + @Override + public int[] getInts(int rowId, int count) { + assert(dictionary == null); + int[] array = new int[count]; + System.arraycopy(intData, rowId, array, 0, count); + return array; + } + /** * Returns the dictionary Id for rowId. * This should only be called when the ColumnVector is dictionaryIds. @@ -286,6 +320,14 @@ public long getLong(int rowId) { } } + @Override + public long[] getLongs(int rowId, int count) { + assert(dictionary == null); + long[] array = new long[count]; + System.arraycopy(longData, rowId, array, 0, count); + return array; + } + // // APIs dealing with floats // @@ -325,6 +367,14 @@ public float getFloat(int rowId) { } } + @Override + public float[] getFloats(int rowId, int count) { + assert(dictionary == null); + float[] array = new float[count]; + System.arraycopy(floatData, rowId, array, 0, count); + return array; + } + // // APIs dealing with doubles // @@ -366,6 +416,14 @@ public double getDouble(int rowId) { } } + @Override + public double[] getDoubles(int rowId, int count) { + assert(dictionary == null); + double[] array = new double[count]; + System.arraycopy(doubleData, rowId, array, 0, count); + return array; + } + // // APIs dealing with Arrays // diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 80d41577dcf2d..ccf7aa7022a2a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -709,6 +709,55 @@ class ColumnarBatchSuite extends SparkFunSuite { }} } + test("toArray for primitive types") { + // (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + (MemoryMode.ON_HEAP :: Nil).foreach { memMode => { + val len = 4 + + val columnBool = ColumnVector.allocate(len, new ArrayType(BooleanType, false), memMode) + val boolArray = Array(false, true, false, true) + boolArray.zipWithIndex.map { case (v, i) => columnBool.arrayData.putBoolean(i, v) } + 
columnBool.putArray(0, 0, len) + assert(columnBool.getArray(0).toBooleanArray === boolArray) + + val columnByte = ColumnVector.allocate(len, new ArrayType(ByteType, false), memMode) + val byteArray = Array[Byte](0, 1, 2, 3) + byteArray.zipWithIndex.map { case (v, i) => columnByte.arrayData.putByte(i, v) } + columnByte.putArray(0, 0, len) + assert(columnByte.getArray(0).toByteArray === byteArray) + + val columnShort = ColumnVector.allocate(len, new ArrayType(ShortType, false), memMode) + val shortArray = Array[Short](0, 1, 2, 3) + shortArray.zipWithIndex.map { case (v, i) => columnShort.arrayData.putShort(i, v) } + columnShort.putArray(0, 0, len) + assert(columnShort.getArray(0).toShortArray === shortArray) + + val columnInt = ColumnVector.allocate(len, new ArrayType(IntegerType, false), memMode) + val intArray = Array(0, 1, 2, 3) + intArray.zipWithIndex.map { case (v, i) => columnInt.arrayData.putInt(i, v) } + columnInt.putArray(0, 0, len) + assert(columnInt.getArray(0).toIntArray === intArray) + + val columnLong = ColumnVector.allocate(len, new ArrayType(LongType, false), memMode) + val longArray = Array[Long](0, 1, 2, 3) + longArray.zipWithIndex.map { case (v, i) => columnLong.arrayData.putLong(i, v) } + columnLong.putArray(0, 0, len) + assert(columnLong.getArray(0).toLongArray === longArray) + + val columnFloat = ColumnVector.allocate(len, new ArrayType(FloatType, false), memMode) + val floatArray = Array(0.0F, 1.1F, 2.2F, 3.3F) + floatArray.zipWithIndex.map { case (v, i) => columnFloat.arrayData.putFloat(i, v) } + columnFloat.putArray(0, 0, len) + assert(columnFloat.getArray(0).toFloatArray === floatArray) + + val columnDouble = ColumnVector.allocate(len, new ArrayType(DoubleType, false), memMode) + val doubleArray = Array(0.0, 1.1, 2.2, 3.3) + doubleArray.zipWithIndex.map { case (v, i) => columnDouble.arrayData.putDouble(i, v) } + columnDouble.putArray(0, 0, len) + assert(columnDouble.getArray(0).toDoubleArray === doubleArray) + }} + } + test("Struct Column") { (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { val schema = new StructType().add("int", IntegerType).add("double", DoubleType) From 5df99bd364561c6f4c02308149ba5eb71f89247e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 7 Jul 2017 13:12:20 +0800 Subject: [PATCH 118/779] [SPARK-20703][SQL][FOLLOW-UP] Associate metrics with data writes onto DataFrameWriter operations ## What changes were proposed in this pull request? Remove time metrics since it seems no way to measure it in non per-row tracking. ## How was this patch tested? Existing tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Liang-Chi Hsieh Closes #18558 from viirya/SPARK-20703-followup. 
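Concretely, the per-task write summaries are now rolled up with no timing component; roughly (a sketch, not the actual `DataWritingCommand` code, using a stand-in case class that mirrors `ExecutedWriteSummary` in the diff below):

```scala
// Stand-in for ExecutedWriteSummary after this change (timing field removed).
final case class WriteSummary(
    updatedPartitions: Set[String],
    numOutputFile: Int,
    numOutputRows: Long,
    numOutputBytes: Long)

// Roll-up feeding the remaining numFiles / numOutputRows / numOutputBytes metrics.
def rollUp(summaries: Seq[WriteSummary]): (Int, Long, Long) = {
  val numFiles   = summaries.map(_.numOutputFile).sum
  val totalRows  = summaries.map(_.numOutputRows).sum
  val totalBytes = summaries.map(_.numOutputBytes).sum
  (numFiles, totalRows, totalBytes)
}
```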
--- .../command/DataWritingCommand.scala | 10 --------- .../datasources/FileFormatWriter.scala | 22 +++---------------- .../sql/hive/execution/SQLMetricsSuite.scala | 3 --- 3 files changed, 3 insertions(+), 32 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala index 0c381a2c02986..700f7f81dc8a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -30,7 +30,6 @@ trait DataWritingCommand extends RunnableCommand { override lazy val metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( - "avgTime" -> SQLMetrics.createMetric(sparkContext, "average writing time (ms)"), "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of written files"), "numOutputBytes" -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), @@ -47,23 +46,14 @@ trait DataWritingCommand extends RunnableCommand { var numFiles = 0 var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L - var totalWritingTime: Long = 0L writeSummaries.foreach { summary => numPartitions += summary.updatedPartitions.size numFiles += summary.numOutputFile totalNumBytes += summary.numOutputBytes totalNumOutput += summary.numOutputRows - totalWritingTime += summary.totalWritingTime } - val avgWritingTime = if (numFiles > 0) { - (totalWritingTime / numFiles).toLong - } else { - 0L - } - - metrics("avgTime").add(avgWritingTime) metrics("numFiles").add(numFiles) metrics("numOutputBytes").add(totalNumBytes) metrics("numOutputRows").add(totalNumOutput) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 64866630623ab..9eb9eae699e94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -275,8 +275,6 @@ object FileFormatWriter extends Logging { /** * The data structures used to measure metrics during writing. 
*/ - protected var totalWritingTime: Long = 0L - protected var timeOnCurrentFile: Long = 0L protected var numOutputRows: Long = 0L protected var numOutputBytes: Long = 0L @@ -343,9 +341,7 @@ object FileFormatWriter extends Logging { } val internalRow = iter.next() - val startTime = System.nanoTime() currentWriter.write(internalRow) - timeOnCurrentFile += (System.nanoTime() - startTime) recordsInFile += 1 } releaseResources() @@ -355,17 +351,13 @@ object FileFormatWriter extends Logging { updatedPartitions = Set.empty, numOutputFile = fileCounter + 1, numOutputBytes = numOutputBytes, - numOutputRows = numOutputRows, - totalWritingTime = totalWritingTime) + numOutputRows = numOutputRows) } override def releaseResources(): Unit = { if (currentWriter != null) { try { - val startTime = System.nanoTime() currentWriter.close() - totalWritingTime += (timeOnCurrentFile + System.nanoTime() - startTime) / 1000 / 1000 - timeOnCurrentFile = 0 numOutputBytes += getFileSize(taskAttemptContext.getConfiguration, currentPath) } finally { currentWriter = null @@ -504,9 +496,7 @@ object FileFormatWriter extends Logging { releaseResources() newOutputWriter(currentPartColsAndBucketId, getPartPath, fileCounter, updatedPartitions) } - val startTime = System.nanoTime() currentWriter.write(getOutputRow(row)) - timeOnCurrentFile += (System.nanoTime() - startTime) recordsInFile += 1 } if (currentPartColsAndBucketId != null) { @@ -519,17 +509,13 @@ object FileFormatWriter extends Logging { updatedPartitions = updatedPartitions.toSet, numOutputFile = totalFileCounter, numOutputBytes = numOutputBytes, - numOutputRows = numOutputRows, - totalWritingTime = totalWritingTime) + numOutputRows = numOutputRows) } override def releaseResources(): Unit = { if (currentWriter != null) { try { - val startTime = System.nanoTime() currentWriter.close() - totalWritingTime += (timeOnCurrentFile + System.nanoTime() - startTime) / 1000 / 1000 - timeOnCurrentFile = 0 numOutputBytes += getFileSize(taskAttemptContext.getConfiguration, currentPath) } finally { currentWriter = null @@ -547,11 +533,9 @@ object FileFormatWriter extends Logging { * @param numOutputFile the total number of files. * @param numOutputRows the number of output rows. * @param numOutputBytes the bytes of output data. - * @param totalWritingTime the total writing time in ms. 
*/ case class ExecutedWriteSummary( updatedPartitions: Set[String], numOutputFile: Int, numOutputRows: Long, - numOutputBytes: Long, - totalWritingTime: Long) + numOutputBytes: Long) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala index 1ef1988d4c605..24c038587d1d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala @@ -65,9 +65,6 @@ class SQLMetricsSuite extends SQLTestUtils with TestHiveSingleton { val totalNumBytesMetric = executedNode.metrics.find(_.name == "bytes of written output").get val totalNumBytes = metrics(totalNumBytesMetric.accumulatorId).replaceAll(",", "").toInt assert(totalNumBytes > 0) - val writingTimeMetric = executedNode.metrics.find(_.name == "average writing time (ms)").get - val writingTime = metrics(writingTimeMetric.accumulatorId).replaceAll(",", "").toInt - assert(writingTime >= 0) } private def testMetricsNonDynamicPartition( From 7fcbb9b57f5eba8b14bf7d86ebaa08a8ee937cd2 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Fri, 7 Jul 2017 08:31:30 +0100 Subject: [PATCH 119/779] [SPARK-21313][SS] ConsoleSink's string representation ## What changes were proposed in this pull request? Add `toString` with options for `ConsoleSink` so it shows nicely in query progress. **BEFORE** ``` "sink" : { "description" : "org.apache.spark.sql.execution.streaming.ConsoleSink4b340441" } ``` **AFTER** ``` "sink" : { "description" : "ConsoleSink[numRows=10, truncate=false]" } ``` /cc zsxwing tdas ## How was this patch tested? Local build Author: Jacek Laskowski Closes #18539 from jaceklaskowski/SPARK-21313-ConsoleSink-toString. 
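For illustration, a sketch (not part of the patch) of where the new description surfaces, assuming an active `spark` session and a toy rate-source query:

```scala
// A console sink whose progress now carries the readable description.
val query = spark.readStream
  .format("rate")
  .load()
  .writeStream
  .format("console")
  .option("numRows", 10)
  .option("truncate", false)
  .start()

// After at least one trigger has completed:
// prints "ConsoleSink[numRows=10, truncate=false]" instead of an object hash.
println(query.lastProgress.sink.description)
```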
--- .../org/apache/spark/sql/execution/streaming/ForeachSink.scala | 2 ++ .../org/apache/spark/sql/execution/streaming/console.scala | 2 ++ 2 files changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala index de09fb568d2a6..2cc54107f8b83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala @@ -63,4 +63,6 @@ class ForeachSink[T : Encoder](writer: ForeachWriter[T]) extends Sink with Seria } } } + + override def toString(): String = "ForeachSink" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 3baea6376069f..1c9284e252bd6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -52,6 +52,8 @@ class ConsoleSink(options: Map[String, String]) extends Sink with Logging { data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) .show(numRowsToShow, isTruncated) } + + override def toString(): String = s"ConsoleSink[numRows=$numRowsToShow, truncate=$isTruncated]" } case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) From 56536e9992ac4ea771758463962e49bba410e896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Fri, 7 Jul 2017 18:32:01 +0800 Subject: [PATCH 120/779] [SPARK-21285][ML] VectorAssembler reports the column name of unsupported data type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? add the column name in the exception which is raised by unsupported data type. ## How was this patch tested? + [x] pass all tests. Author: Yan Facai (颜发才) Closes #18523 from facaiy/ENH/vectorassembler_add_col. 
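For example (a sketch, not test code from this patch; the DataFrame and the column names `a`/`b` are made up), assembling unsupported string columns now reports every offending column:

```scala
import org.apache.spark.ml.feature.VectorAssembler

// Assumes an active `spark` session; "a" and "b" are string columns.
val df = spark.createDataFrame(Seq((1, "x", "y"))).toDF("id", "a", "b")
val assembler = new VectorAssembler()
  .setInputCols(Array("id", "a", "b"))
  .setOutputCol("features")

// Throws IllegalArgumentException with a line per bad column:
//   Data type StringType of column a is not supported.
//   Data type StringType of column b is not supported.
assembler.transform(df)
```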
--- .../apache/spark/ml/feature/VectorAssembler.scala | 15 +++++++++------ .../spark/ml/feature/VectorAssemblerSuite.scala | 5 ++++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index ca900536bc7b8..73f27d1a423d9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -113,12 +113,15 @@ class VectorAssembler @Since("1.4.0") (@Since("1.4.0") override val uid: String) override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) - val inputDataTypes = inputColNames.map(name => schema(name).dataType) - inputDataTypes.foreach { - case _: NumericType | BooleanType => - case t if t.isInstanceOf[VectorUDT] => - case other => - throw new IllegalArgumentException(s"Data type $other is not supported.") + val incorrectColumns = inputColNames.flatMap { name => + schema(name).dataType match { + case _: NumericType | BooleanType => None + case t if t.isInstanceOf[VectorUDT] => None + case other => Some(s"Data type $other of column $name is not supported.") + } + } + if (incorrectColumns.nonEmpty) { + throw new IllegalArgumentException(incorrectColumns.mkString("\n")) } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index 46cced3a9a6e5..6aef1c6837025 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -79,7 +79,10 @@ class VectorAssemblerSuite val thrown = intercept[IllegalArgumentException] { assembler.transform(df) } - assert(thrown.getMessage contains "Data type StringType is not supported") + assert(thrown.getMessage contains + "Data type StringType of column a is not supported.\n" + + "Data type StringType of column b is not supported.\n" + + "Data type StringType of column c is not supported.") } test("ML attributes") { From fef081309fc28efe8e136f363d85d7ccd9466e61 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 7 Jul 2017 20:04:30 +0800 Subject: [PATCH 121/779] [SPARK-21335][SQL] support un-aliased subquery ## What changes were proposed in this pull request? un-aliased subquery is supported by Spark SQL for a long time. Its semantic was not well defined and had confusing behaviors, and it's not a standard SQL syntax, so we disallowed it in https://issues.apache.org/jira/browse/SPARK-20690 . However, this is a breaking change, and we do have existing queries using un-aliased subquery. We should add the support back and fix its semantic. This PR fixes the un-aliased subquery by assigning a default alias name. After this PR, there is no syntax change from branch 2.2 to master, but we invalid a weird use case: `SELECT v.i from (SELECT i FROM v)`. Now this query will throw analysis exception because users should not be able to use the qualifier inside a subquery. ## How was this patch tested? new regression test Author: Wenchen Fan Closes #18559 from cloud-fan/sub-query. 
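To illustrate the resulting behaviour (a sketch, not the regression test itself; `t0` is an assumed temp view with an `id` column):

```scala
// Assumes an active `spark` session.
spark.range(10).createOrReplaceTempView("t0")

// Accepted again: the subquery gets the default alias
// "__auto_generated_subquery_name" under the hood.
spark.sql("SELECT a FROM (SELECT id AS a FROM t0)").show()

// Rejected at analysis time: the outer query can no longer use the inner
// table's qualifier, so this throws AnalysisException.
spark.sql("SELECT t0.id FROM (SELECT id FROM t0)")
```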
--- .../sql/catalyst/parser/AstBuilder.scala | 16 ++-- .../catalyst/plans/logical/LogicalPlan.scala | 2 +- .../sql/catalyst/parser/PlanParserSuite.scala | 13 --- .../resources/sql-tests/inputs/group-by.sql | 2 +- .../test/resources/sql-tests/inputs/limit.sql | 2 +- .../sql-tests/inputs/string-functions.sql | 2 +- .../in-subquery/in-set-operations.sql | 2 +- .../negative-cases/invalid-correlation.sql | 2 +- .../scalar-subquery-predicate.sql | 2 +- .../test/resources/sql-tests/inputs/union.sql | 4 +- .../results/columnresolution-negative.sql.out | 16 ++-- .../sql-tests/results/group-by.sql.out | 2 +- .../resources/sql-tests/results/limit.sql.out | 2 +- .../results/string-functions.sql.out | 6 +- .../in-subquery/in-set-operations.sql.out | 2 +- .../invalid-correlation.sql.out | 2 +- .../scalar-subquery-predicate.sql.out | 2 +- .../results/subquery/subquery-in-from.sql.out | 20 +---- .../resources/sql-tests/results/union.sql.out | 4 +- .../apache/spark/sql/CachedTableSuite.scala | 82 ++++++------------- .../org/apache/spark/sql/SQLQuerySuite.scala | 13 +++ .../org/apache/spark/sql/SubquerySuite.scala | 8 +- 22 files changed, 83 insertions(+), 123 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index b6a4686bb9ec9..4d725904bc9b9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -751,15 +751,17 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { * hooks. */ override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) { - // The unaliased subqueries in the FROM clause are disallowed. Instead of rejecting it in - // parser rules, we handle it here in order to provide better error message. - if (ctx.strictIdentifier == null) { - throw new ParseException("The unaliased subqueries in the FROM clause are not supported.", - ctx) + val alias = if (ctx.strictIdentifier == null) { + // For un-aliased subqueries, use a default alias name that is not likely to conflict with + // normal subquery names, so that parent operators can only access the columns in subquery by + // unqualified names. Users can still use this special qualifier to access columns if they + // know it, but that's not recommended. + "__auto_generated_subquery_name" + } else { + ctx.strictIdentifier.getText } - aliasPlan(ctx.strictIdentifier, - plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)) + SubqueryAlias(alias, plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 8649603b1a9f5..9b440cd99f994 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -253,7 +253,7 @@ abstract class LogicalPlan // More than one match. 
case ambiguousReferences => - val referenceNames = ambiguousReferences.map(_._1).mkString(", ") + val referenceNames = ambiguousReferences.map(_._1.qualifiedName).mkString(", ") throw new AnalysisException( s"Reference '$name' is ambiguous, could be: $referenceNames.") } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 5b2573fa4d601..6dad097041a15 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -450,19 +450,6 @@ class PlanParserSuite extends AnalysisTest { | (select id from t0)) as u_1 """.stripMargin, plan.union(plan).union(plan).as("u_1").select('id)) - - } - - test("aliased subquery") { - val errMsg = "The unaliased subqueries in the FROM clause are not supported" - - assertEqual("select a from (select id as a from t0) tt", - table("t0").select('id.as("a")).as("tt").select('a)) - intercept("select a from (select id as a from t0)", errMsg) - - assertEqual("from (select id as a from t0) tt select a", - table("t0").select('id.as("a")).as("tt").select('a)) - intercept("from (select id as a from t0) select a", errMsg) } test("scalar sub-query") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index bc2120727dac2..1e1384549a410 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -34,7 +34,7 @@ SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), FROM testData; -- Aggregate with foldable input and multiple distinct groups. 
-SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t GROUP BY a; +SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; -- Aliases in SELECT could be used in GROUP BY SELECT a AS k, COUNT(b) FROM testData GROUP BY k; diff --git a/sql/core/src/test/resources/sql-tests/inputs/limit.sql b/sql/core/src/test/resources/sql-tests/inputs/limit.sql index df555bdc1976d..f21912a042716 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/limit.sql @@ -21,7 +21,7 @@ SELECT * FROM testdata LIMIT true; SELECT * FROM testdata LIMIT 'a'; -- limit within a subquery -SELECT * FROM (SELECT * FROM range(10) LIMIT 5) t WHERE id > 3; +SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3; -- limit ALL SELECT * FROM testdata WHERE key < 3 LIMIT ALL; diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 20c0390664037..c95f4817b7ce0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -7,7 +7,7 @@ select 'a' || 'b' || 'c'; -- Check if catalyst combine nested `Concat`s EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col -FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)) t; +FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)); -- replace function select replace('abc', 'b', '123'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql index 42f84e9748713..5c371d2305ac8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql @@ -394,7 +394,7 @@ FROM (SELECT * FROM t1)) t4 WHERE t4.t2b IN (SELECT Min(t3b) FROM t3 - WHERE t4.t2a = t3a)) T; + WHERE t4.t2a = t3a)); -- UNION, UNION ALL, UNION DISTINCT, INTERSECT and EXCEPT for NOT IN -- TC 01.12 diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/invalid-correlation.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/invalid-correlation.sql index f3f0c7622ccdb..e22cade936792 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/invalid-correlation.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/invalid-correlation.sql @@ -23,7 +23,7 @@ AND t2b = (SELECT max(avg) FROM (SELECT t2b, avg(t2b) avg FROM t2 WHERE t2a = t1.t1b - ) T + ) ) ; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql index dbe8d76d2f117..fb0d07fbdace7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql @@ -19,7 +19,7 @@ AND c.cv = (SELECT max(avg) FROM (SELECT c1.cv, avg(c1.cv) avg FROM c c1 WHERE c1.ck = p.pk - GROUP BY c1.cv) T); + GROUP BY c1.cv)); create temporary view t1 as select * from values ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/union.sql b/sql/core/src/test/resources/sql-tests/inputs/union.sql index 63bc044535e4d..e57d69eaad033 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/union.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/union.sql @@ -5,7 +5,7 @@ CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (1.0, 1), (2.0, 4) tbl(c1, c2); SELECT * FROM (SELECT * FROM t1 UNION ALL - SELECT * FROM t1) T; + SELECT * FROM t1); -- Type Coerced Union SELECT * @@ -13,7 +13,7 @@ FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t2 UNION ALL - SELECT * FROM t2) T; + SELECT * FROM t2); -- Regression test for SPARK-18622 SELECT a diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out index 9e60e592c2bd1..b5a4f5c2bf654 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out @@ -72,7 +72,7 @@ SELECT i1 FROM t1, mydb1.t1 struct<> -- !query 8 output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 9 @@ -81,7 +81,7 @@ SELECT t1.i1 FROM t1, mydb1.t1 struct<> -- !query 9 output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 10 @@ -99,7 +99,7 @@ SELECT i1 FROM t1, mydb2.t1 struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 12 @@ -108,7 +108,7 @@ SELECT t1.i1 FROM t1, mydb2.t1 struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 13 @@ -125,7 +125,7 @@ SELECT i1 FROM t1, mydb1.t1 struct<> -- !query 14 output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 15 @@ -134,7 +134,7 @@ SELECT t1.i1 FROM t1, mydb1.t1 struct<> -- !query 15 output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 16 @@ -143,7 +143,7 @@ SELECT i1 FROM t1, mydb2.t1 struct<> -- !query 16 output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 17 @@ -152,7 +152,7 @@ SELECT t1.i1 FROM t1, mydb2.t1 struct<> -- !query 17 output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: i1#x, i1#x.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: t1.i1, t1.i1.; line 1 pos 7 -- !query 18 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index e23ebd4e822fa..986bb01c13fe4 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -134,7 +134,7 @@ struct -- !query 14 output diff --git 
a/sql/core/src/test/resources/sql-tests/results/limit.sql.out b/sql/core/src/test/resources/sql-tests/results/limit.sql.out index afdd6df2a5714..146abe6cbd058 100644 --- a/sql/core/src/test/resources/sql-tests/results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/limit.sql.out @@ -93,7 +93,7 @@ The limit expression must be integer type, but got string; -- !query 10 -SELECT * FROM (SELECT * FROM range(10) LIMIT 5) t WHERE id > 3 +SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3 -- !query 10 schema struct -- !query 10 output diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 52eb554edf89e..b0ae9d775d968 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -30,20 +30,20 @@ abc -- !query 3 EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col -FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)) t +FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)) -- !query 3 schema struct -- !query 3 output == Parsed Logical Plan == 'Project [concat(concat(concat('col1, 'col2), 'col3), 'col4) AS col#x] -+- 'SubqueryAlias t ++- 'SubqueryAlias __auto_generated_subquery_name +- 'Project ['id AS col1#x, 'id AS col2#x, 'id AS col3#x, 'id AS col4#x] +- 'UnresolvedTableValuedFunction range, [10] == Analyzed Logical Plan == col: string Project [concat(concat(concat(cast(col1#xL as string), cast(col2#xL as string)), cast(col3#xL as string)), cast(col4#xL as string)) AS col#x] -+- SubqueryAlias t ++- SubqueryAlias __auto_generated_subquery_name +- Project [id#xL AS col1#xL, id#xL AS col2#xL, id#xL AS col3#xL, id#xL AS col4#xL] +- Range (0, 10, step=1, splits=None) diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out index 5780f49648ec7..e06f9206d3401 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out @@ -496,7 +496,7 @@ FROM (SELECT * FROM t1)) t4 WHERE t4.t2b IN (SELECT Min(t3b) FROM t3 - WHERE t4.t2a = t3a)) T + WHERE t4.t2a = t3a)) -- !query 13 schema struct -- !query 13 output diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index ca3930b33e06d..e4b1a2dbc675c 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -40,7 +40,7 @@ AND t2b = (SELECT max(avg) FROM (SELECT t2b, avg(t2b) avg FROM t2 WHERE t2a = t1.t1b - ) T + ) ) -- !query 3 schema struct<> diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index 1d5dddca76a17..8b29300e71f90 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -39,7 +39,7 @@ AND c.cv = (SELECT max(avg) FROM (SELECT c1.cv, avg(c1.cv) avg FROM c c1 WHERE c1.ck = p.pk - GROUP BY c1.cv) T) + GROUP BY c1.cv)) -- !query 3 schema struct -- !query 3 output diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out index 14553557d1ffc..50370df349168 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out @@ -37,26 +37,14 @@ struct -- !query 4 SELECT * FROM (SELECT * FROM testData) WHERE key = 1 -- !query 4 schema -struct<> +struct -- !query 4 output -org.apache.spark.sql.catalyst.parser.ParseException - -The unaliased subqueries in the FROM clause are not supported.(line 1, pos 14) - -== SQL == -SELECT * FROM (SELECT * FROM testData) WHERE key = 1 ---------------^^^ +1 1 -- !query 5 FROM (SELECT * FROM testData WHERE key = 1) SELECT * -- !query 5 schema -struct<> +struct -- !query 5 output -org.apache.spark.sql.catalyst.parser.ParseException - -The unaliased subqueries in the FROM clause are not supported.(line 1, pos 5) - -== SQL == -FROM (SELECT * FROM testData WHERE key = 1) SELECT * ------^^^ +1 1 diff --git a/sql/core/src/test/resources/sql-tests/results/union.sql.out b/sql/core/src/test/resources/sql-tests/results/union.sql.out index 865b3aed65d70..d123b7fdbe0cf 100644 --- a/sql/core/src/test/resources/sql-tests/results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/union.sql.out @@ -22,7 +22,7 @@ struct<> SELECT * FROM (SELECT * FROM t1 UNION ALL - SELECT * FROM t1) T + SELECT * FROM t1) -- !query 2 schema struct -- !query 2 output @@ -38,7 +38,7 @@ FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t2 UNION ALL - SELECT * FROM t2) T + SELECT * FROM t2) -- !query 3 schema struct -- !query 3 output diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 506cc2548e260..3e4f619431599 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -631,13 +631,13 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext val ds2 = sql( """ - |SELECT * FROM (SELECT max(c1) as c1 FROM t1 GROUP BY c1) tt + |SELECT * FROM (SELECT c1, max(c1) FROM t1 GROUP BY c1) |WHERE - |tt.c1 = (SELECT max(c1) FROM t2 GROUP BY c1) + |c1 = (SELECT max(c1) FROM t2 GROUP BY c1) |OR |EXISTS (SELECT c1 FROM t3) |OR - |tt.c1 IN (SELECT c1 FROM t4) + |c1 IN (SELECT c1 FROM t4) """.stripMargin) assert(getNumInMemoryRelations(ds2) == 4) } @@ -683,20 +683,15 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext Seq(1).toDF("c1").createOrReplaceTempView("t1") Seq(2).toDF("c1").createOrReplaceTempView("t2") - sql( + val sql1 = """ |SELECT * FROM t1 |WHERE |NOT EXISTS (SELECT * FROM t2) - """.stripMargin).cache() + """.stripMargin + sql(sql1).cache() - val cachedDs = - sql( - """ - |SELECT * FROM t1 - |WHERE - |NOT EXISTS (SELECT * FROM t2) - """.stripMargin) + val cachedDs = sql(sql1) assert(getNumInMemoryRelations(cachedDs) == 1) // Additional predicate in the subquery plan should cause a cache miss @@ -717,20 +712,15 @@ class CachedTableSuite extends QueryTest with SQLTestUtils 
with SharedSQLContext Seq(1).toDF("c1").createOrReplaceTempView("t2") // Simple correlated predicate in subquery - sql( + val sqlText = """ |SELECT * FROM t1 |WHERE |t1.c1 in (SELECT t2.c1 FROM t2 where t1.c1 = t2.c1) - """.stripMargin).cache() + """.stripMargin + sql(sqlText).cache() - val cachedDs = - sql( - """ - |SELECT * FROM t1 - |WHERE - |t1.c1 in (SELECT t2.c1 FROM t2 where t1.c1 = t2.c1) - """.stripMargin) + val cachedDs = sql(sqlText) assert(getNumInMemoryRelations(cachedDs) == 1) } } @@ -741,22 +731,16 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext spark.catalog.cacheTable("t1") // underlying table t1 is cached as well as the query that refers to it. - val ds = - sql( + val sqlText = """ |SELECT * FROM t1 |WHERE |NOT EXISTS (SELECT * FROM t1) - """.stripMargin) + """.stripMargin + val ds = sql(sqlText) assert(getNumInMemoryRelations(ds) == 2) - val cachedDs = - sql( - """ - |SELECT * FROM t1 - |WHERE - |NOT EXISTS (SELECT * FROM t1) - """.stripMargin).cache() + val cachedDs = sql(sqlText).cache() assert(getNumInMemoryTablesRecursively(cachedDs.queryExecution.sparkPlan) == 3) } } @@ -769,45 +753,31 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext Seq(1).toDF("c1").createOrReplaceTempView("t4") // Nested predicate subquery - sql( + val sql1 = """ |SELECT * FROM t1 |WHERE |c1 IN (SELECT c1 FROM t2 WHERE c1 IN (SELECT c1 FROM t3 WHERE c1 = 1)) - """.stripMargin).cache() + """.stripMargin + sql(sql1).cache() - val cachedDs = - sql( - """ - |SELECT * FROM t1 - |WHERE - |c1 IN (SELECT c1 FROM t2 WHERE c1 IN (SELECT c1 FROM t3 WHERE c1 = 1)) - """.stripMargin) + val cachedDs = sql(sql1) assert(getNumInMemoryRelations(cachedDs) == 1) // Scalar subquery and predicate subquery - sql( + val sql2 = """ - |SELECT * FROM (SELECT max(c1) as c1 FROM t1 GROUP BY c1) tt + |SELECT * FROM (SELECT c1, max(c1) FROM t1 GROUP BY c1) |WHERE - |tt.c1 = (SELECT max(c1) FROM t2 GROUP BY c1) + |c1 = (SELECT max(c1) FROM t2 GROUP BY c1) |OR |EXISTS (SELECT c1 FROM t3) |OR - |tt.c1 IN (SELECT c1 FROM t4) - """.stripMargin).cache() + |c1 IN (SELECT c1 FROM t4) + """.stripMargin + sql(sql2).cache() - val cachedDs2 = - sql( - """ - |SELECT * FROM (SELECT max(c1) as c1 FROM t1 GROUP BY c1) tt - |WHERE - |tt.c1 = (SELECT max(c1) FROM t2 GROUP BY c1) - |OR - |EXISTS (SELECT c1 FROM t3) - |OR - |tt.c1 IN (SELECT c1 FROM t4) - """.stripMargin) + val cachedDs2 = sql(sql2) assert(getNumInMemoryRelations(cachedDs2) == 1) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5171aaebc9907..472ff7385b194 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2638,4 +2638,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-21335: support un-aliased subquery") { + withTempView("v") { + Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("v") + checkAnswer(sql("SELECT i from (SELECT i FROM v)"), Row(1)) + + val e = intercept[AnalysisException](sql("SELECT v.i from (SELECT i FROM v)")) + assert(e.message == + "cannot resolve '`v.i`' given input columns: [__auto_generated_subquery_name.i]") + + checkAnswer(sql("SELECT __auto_generated_subquery_name.i from (SELECT i FROM v)"), Row(1)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index c0a3b5add313a..7bcb419e8df6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -112,7 +112,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { | with t4 as (select 1 as d, 3 as e) | select * from t4 cross join t2 where t2.b = t4.d | ) - | select a from (select 1 as a union all select 2 as a) t + | select a from (select 1 as a union all select 2 as a) | where a = (select max(d) from t3) """.stripMargin), Array(Row(1)) @@ -606,8 +606,8 @@ class SubquerySuite extends QueryTest with SharedSQLContext { | select cntPlusOne + 1 as cntPlusTwo from ( | select cnt + 1 as cntPlusOne from ( | select sum(r.c) s, count(*) cnt from r where l.a = r.c having cnt = 0 - | ) t1 - | ) t2 + | ) + | ) |) = 2""".stripMargin), Row(1) :: Row(1) :: Row(null) :: Row(null) :: Nil) } @@ -655,7 +655,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext { """ | select c1 from onerow t1 | where exists (select 1 - | from (select 1 as c1 from onerow t2 LIMIT 1) t2 + | from (select c1 from onerow t2 LIMIT 1) t2 | where t1.c1=t2.c1)""".stripMargin), Row(1) :: Nil) } From fbbe37ed416f2ca9d8fc713a135b335b8a0247bf Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 7 Jul 2017 20:10:24 +0800 Subject: [PATCH 122/779] [SPARK-19358][CORE] LiveListenerBus shall log the event name when dropping them due to a fully filled queue ## What changes were proposed in this pull request? Some dropped event will make the whole application behaves unexpectedly, e.g. some UI problem...we shall log the dropped event name to facilitate the debugging ## How was this patch tested? Existing tests Author: CodingCat Closes #16697 from CodingCat/SPARK-19358. --- .../main/scala/org/apache/spark/scheduler/LiveListenerBus.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index f0887e090b956..0dd63d4392800 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -232,6 +232,7 @@ private[spark] class LiveListenerBus(conf: SparkConf) extends SparkListenerBus { "This likely means one of the SparkListeners is too slow and cannot keep up with " + "the rate at which tasks are being started by the scheduler.") } + logTrace(s"Dropping event $event") } } From a0fe32a219253f0abe9d67cf178c73daf5f6fcc1 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 7 Jul 2017 15:39:29 -0700 Subject: [PATCH 123/779] [SPARK-21336] Revise rand comparison in BatchEvalPythonExecSuite ## What changes were proposed in this pull request? Revise rand comparison in BatchEvalPythonExecSuite In BatchEvalPythonExecSuite, there are two cases using the case "rand() > 3" Rand() generates a random value in [0, 1), it is wired to be compared with 3, use 0.3 instead ## How was this patch tested? unit test Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang Closes #18560 from gengliangwang/revise_BatchEvalPythonExecSuite. 
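To make the reasoning concrete (a sketch, not part of the patch): `rand()` is uniform on `[0, 1)`, so comparing it with 3 yields a predicate that is never true, while comparing with 0.3 keeps it genuinely non-deterministic, which is what these push-down tests rely on.

```scala
import org.apache.spark.sql.functions.rand

// rand() < 1 always holds, so "rand() > 3" can never be satisfied.
// "rand() > 0.3" passes roughly 70% of rows and, being non-deterministic,
// still prevents the filter from being pushed below the Python UDF in the suite.
val sampled = spark.range(1000).where(rand() > 0.3)
sampled.count()   // ~700 in expectation
```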
--- .../spark/sql/execution/python/BatchEvalPythonExecSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index 80ef4eb75ca53..bbd9484271a3e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -65,7 +65,7 @@ class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext { test("Python UDF: no push down on non-deterministic") { val df = Seq(("Hello", 4)).toDF("a", "b") - .where("b > 4 and dummyPythonUDF(a) and rand() > 3") + .where("b > 4 and dummyPythonUDF(a) and rand() > 0.3") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec( And(_: AttributeReference, _: GreaterThan), @@ -77,7 +77,7 @@ class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext { test("Python UDF: no push down on predicates starting from the first non-deterministic") { val df = Seq(("Hello", 4)).toDF("a", "b") - .where("dummyPythonUDF(a) and rand() > 3 and b > 4") + .where("dummyPythonUDF(a) and rand() > 0.3 and b > 4") val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { case f @ FilterExec(And(_: And, _: GreaterThan), InputAdapter(_: BatchEvalPythonExec)) => f } From e1a172c201d68406faa53b113518b10c879f1ff6 Mon Sep 17 00:00:00 2001 From: Andrew Ray Date: Sat, 8 Jul 2017 13:47:41 +0800 Subject: [PATCH 124/779] [SPARK-21100][SQL] Add summary method as alternative to describe that gives quartiles similar to Pandas ## What changes were proposed in this pull request? Adds method `summary` that allows user to specify which statistics and percentiles to calculate. By default it include the existing statistics from `describe` and quartiles (25th, 50th, and 75th percentiles) similar to Pandas. Also changes the implementation of `describe` to delegate to `summary`. ## How was this patch tested? additional unit test Author: Andrew Ray Closes #18307 from aray/SPARK-21100. 
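For reference, typical usage of the new API as sketched in the added scaladoc (`ds` stands for any Dataset; illustrative only):

```scala
// Default statistics: count, mean, stddev, min, 25%, 50%, 75%, max
ds.summary().show()

// Only the requested statistics and approximate percentiles, in the given order
ds.summary("count", "min", "25%", "75%", "max").show()

// To restrict the columns, select them first
ds.select("age", "height").summary().show()

// describe() with no columns now simply delegates to
// summary("count", "mean", "stddev", "min", "max")
ds.describe().show()
```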
--- .../scala/org/apache/spark/sql/Dataset.scala | 113 +++++++++++------- .../sql/execution/stat/StatFunctions.scala | 98 ++++++++++++++- .../org/apache/spark/sql/DataFrameSuite.scala | 112 +++++++++++++---- 3 files changed, 258 insertions(+), 65 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index b1638a2180b07..5326b45b50a8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -38,18 +38,18 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.CatalogRelation import org.apache.spark.sql.catalyst.encoders._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JSONOptions} import org.apache.spark.sql.catalyst.optimizer.CombineUnions import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} -import org.apache.spark.sql.catalyst.util.{usePrettyExpression, DateTimeUtils} +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.python.EvaluatePython +import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel @@ -224,7 +224,7 @@ class Dataset[T] private[sql]( } } - private def aggregatableColumns: Seq[Expression] = { + private[sql] def aggregatableColumns: Seq[Expression] = { schema.fields .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType.isInstanceOf[StringType]) .map { n => @@ -2161,9 +2161,9 @@ class Dataset[T] private[sql]( } /** - * Computes statistics for numeric and string columns, including count, mean, stddev, min, and - * max. If no columns are given, this function computes statistics for all numerical or string - * columns. + * Computes basic statistics for numeric and string columns, including count, mean, stddev, min, + * and max. If no columns are given, this function computes statistics for all numerical or + * string columns. * * This function is meant for exploratory data analysis, as we make no guarantee about the * backward compatibility of the schema of the resulting Dataset. If you want to @@ -2181,46 +2181,79 @@ class Dataset[T] private[sql]( * // max 92.0 192.0 * }}} * + * Use [[summary]] for expanded statistics and control over which statistics to compute. + * + * @param cols Columns to compute statistics on. + * * @group action * @since 1.6.0 */ @scala.annotation.varargs - def describe(cols: String*): DataFrame = withPlan { - - // The list of summary statistics to compute, in the form of expressions. 
- val statistics = List[(String, Expression => Expression)]( - "count" -> ((child: Expression) => Count(child).toAggregateExpression()), - "mean" -> ((child: Expression) => Average(child).toAggregateExpression()), - "stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()), - "min" -> ((child: Expression) => Min(child).toAggregateExpression()), - "max" -> ((child: Expression) => Max(child).toAggregateExpression())) - - val outputCols = - (if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList - - val ret: Seq[Row] = if (outputCols.nonEmpty) { - val aggExprs = statistics.flatMap { case (_, colToAgg) => - outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) - } - - val row = groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq - - // Pivot the data so each summary is one row - row.grouped(outputCols.size).toSeq.zip(statistics).map { case (aggregation, (statistic, _)) => - Row(statistic :: aggregation.toList: _*) - } - } else { - // If there are no output columns, just output a single column that contains the stats. - statistics.map { case (name, _) => Row(name) } - } - - // All columns are string type - val schema = StructType( - StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes - // `toArray` forces materialization to make the seq serializable - LocalRelation.fromExternalRows(schema, ret.toArray.toSeq) + def describe(cols: String*): DataFrame = { + val selected = if (cols.isEmpty) this else select(cols.head, cols.tail: _*) + selected.summary("count", "mean", "stddev", "min", "max") } + /** + * Computes specified statistics for numeric and string columns. Available statistics are: + * + * - count + * - mean + * - stddev + * - min + * - max + * - arbitrary approximate percentiles specified as a percentage (eg, 75%) + * + * If no statistics are given, this function computes count, mean, stddev, min, + * approximate quartiles (percentiles at 25%, 50%, and 75%), and max. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting Dataset. If you want to + * programmatically compute summary statistics, use the `agg` function instead. + * + * {{{ + * ds.summary().show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // mean 53.3 178.05 + * // stddev 11.6 15.7 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 50% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * {{{ + * ds.summary("count", "min", "25%", "75%", "max").show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * To do a summary for specific columns first select them: + * + * {{{ + * ds.select("age", "height").summary().show() + * }}} + * + * See also [[describe]] for basic statistics. + * + * @param statistics Statistics from above list to be computed. + * + * @group action + * @since 2.3.0 + */ + @scala.annotation.varargs + def summary(statistics: String*): DataFrame = StatFunctions.summary(this, statistics.toSeq) + /** * Returns the first `n` rows. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 1debad03c93fa..436e18fdb5ff5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -19,9 +19,10 @@ package org.apache.spark.sql.execution.stat import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} -import org.apache.spark.sql.catalyst.expressions.{Cast, GenericInternalRow} +import org.apache.spark.sql.catalyst.expressions.{Cast, CreateArray, Expression, GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.catalyst.util.QuantileSummaries +import org.apache.spark.sql.catalyst.util.{usePrettyExpression, QuantileSummaries} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -220,4 +221,97 @@ object StatFunctions extends Logging { Dataset.ofRows(df.sparkSession, LocalRelation(schema.toAttributes, table)).na.fill(0.0) } + + /** Calculate selected summary statistics for a dataset */ + def summary(ds: Dataset[_], statistics: Seq[String]): DataFrame = { + + val defaultStatistics = Seq("count", "mean", "stddev", "min", "25%", "50%", "75%", "max") + val selectedStatistics = if (statistics.nonEmpty) statistics else defaultStatistics + + val hasPercentiles = selectedStatistics.exists(_.endsWith("%")) + val (percentiles, percentileNames, remainingAggregates) = if (hasPercentiles) { + val (pStrings, rest) = selectedStatistics.partition(a => a.endsWith("%")) + val percentiles = pStrings.map { p => + try { + p.stripSuffix("%").toDouble / 100.0 + } catch { + case e: NumberFormatException => + throw new IllegalArgumentException(s"Unable to parse $p as a percentile", e) + } + } + require(percentiles.forall(p => p >= 0 && p <= 1), "Percentiles must be in the range [0, 1]") + (percentiles, pStrings, rest) + } else { + (Seq(), Seq(), selectedStatistics) + } + + + // The list of summary statistics to compute, in the form of expressions. 
+ val availableStatistics = Map[String, Expression => Expression]( + "count" -> ((child: Expression) => Count(child).toAggregateExpression()), + "mean" -> ((child: Expression) => Average(child).toAggregateExpression()), + "stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()), + "min" -> ((child: Expression) => Min(child).toAggregateExpression()), + "max" -> ((child: Expression) => Max(child).toAggregateExpression())) + + val statisticFns = remainingAggregates.map { agg => + require(availableStatistics.contains(agg), s"$agg is not a recognised statistic") + agg -> availableStatistics(agg) + } + + def percentileAgg(child: Expression): Expression = + new ApproximatePercentile(child, CreateArray(percentiles.map(Literal(_)))) + .toAggregateExpression() + + val outputCols = ds.aggregatableColumns.map(usePrettyExpression(_).sql).toList + + val ret: Seq[Row] = if (outputCols.nonEmpty) { + var aggExprs = statisticFns.toList.flatMap { case (_, colToAgg) => + outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) + } + if (hasPercentiles) { + aggExprs = outputCols.map(c => Column(percentileAgg(Column(c).expr)).as(c)) ++ aggExprs + } + + val row = ds.groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq + + // Pivot the data so each summary is one row + val grouped: Seq[Seq[Any]] = row.grouped(outputCols.size).toSeq + + val basicStats = if (hasPercentiles) grouped.tail else grouped + + val rows = basicStats.zip(statisticFns).map { case (aggregation, (statistic, _)) => + Row(statistic :: aggregation.toList: _*) + } + + if (hasPercentiles) { + def nullSafeString(x: Any) = if (x == null) null else x.toString + val percentileRows = grouped.head + .map { + case a: Seq[Any] => a + case _ => Seq.fill(percentiles.length)(null: Any) + } + .transpose + .zip(percentileNames) + .map { case (values: Seq[Any], name) => + Row(name :: values.map(nullSafeString).toList: _*) + } + (rows ++ percentileRows) + .sortWith((left, right) => + selectedStatistics.indexOf(left(0)) < selectedStatistics.indexOf(right(0))) + } else { + rows + } + } else { + // If there are no output columns, just output a single column that contains the stats. 
+ selectedStatistics.map(Row(_)) + } + + // All columns are string type + val schema = StructType( + StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes + // `toArray` forces materialization to make the seq serializable + Dataset.ofRows(ds.sparkSession, LocalRelation.fromExternalRows(schema, ret.toArray.toSeq)) + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 9ea9951c24ef1..2c7051bf431c3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -28,8 +28,7 @@ import org.scalatest.Matchers._ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.plans.logical.{Filter, OneRowRelation, Project, Union} -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.plans.logical.{Filter, OneRowRelation, Union} import org.apache.spark.sql.execution.{FilterExec, QueryExecution} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange} @@ -663,13 +662,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(df.schema.map(_.name) === Seq("key", "valueRenamed", "newCol")) } - test("describe") { - val describeTestData = Seq( - ("Bob", 16, 176), - ("Alice", 32, 164), - ("David", 60, 192), - ("Amy", 24, 180)).toDF("name", "age", "height") + private lazy val person2: DataFrame = Seq( + ("Bob", 16, 176), + ("Alice", 32, 164), + ("David", 60, 192), + ("Amy", 24, 180)).toDF("name", "age", "height") + test("describe") { val describeResult = Seq( Row("count", "4", "4", "4"), Row("mean", null, "33.0", "178.0"), @@ -686,32 +685,99 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name) - val describeTwoCols = describeTestData.describe("name", "age", "height") - assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "name", "age", "height")) - checkAnswer(describeTwoCols, describeResult) - // All aggregate value should have been cast to string - describeTwoCols.collect().foreach { row => - assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass) - assert(row.get(3).isInstanceOf[String], "expected string but found " + row.get(3).getClass) - } - - val describeAllCols = describeTestData.describe() + val describeAllCols = person2.describe() assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "name", "age", "height")) checkAnswer(describeAllCols, describeResult) + // All aggregate value should have been cast to string + describeAllCols.collect().foreach { row => + row.toSeq.foreach { value => + if (value != null) { + assert(value.isInstanceOf[String], "expected string but found " + value.getClass) + } + } + } - val describeOneCol = describeTestData.describe("age") + val describeOneCol = person2.describe("age") assert(getSchemaAsSeq(describeOneCol) === Seq("summary", "age")) checkAnswer(describeOneCol, describeResult.map { case Row(s, _, d, _) => Row(s, d)} ) - val describeNoCol = describeTestData.select("name").describe() - assert(getSchemaAsSeq(describeNoCol) === Seq("summary", "name")) - checkAnswer(describeNoCol, describeResult.map { case Row(s, n, _, _) => Row(s, n)} ) + val describeNoCol = 
person2.select().describe() + assert(getSchemaAsSeq(describeNoCol) === Seq("summary")) + checkAnswer(describeNoCol, describeResult.map { case Row(s, _, _, _) => Row(s)} ) - val emptyDescription = describeTestData.limit(0).describe() + val emptyDescription = person2.limit(0).describe() assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "name", "age", "height")) checkAnswer(emptyDescription, emptyDescribeResult) } + test("summary") { + val summaryResult = Seq( + Row("count", "4", "4", "4"), + Row("mean", null, "33.0", "178.0"), + Row("stddev", null, "19.148542155126762", "11.547005383792516"), + Row("min", "Alice", "16", "164"), + Row("25%", null, "24.0", "176.0"), + Row("50%", null, "24.0", "176.0"), + Row("75%", null, "32.0", "180.0"), + Row("max", "David", "60", "192")) + + val emptySummaryResult = Seq( + Row("count", "0", "0", "0"), + Row("mean", null, null, null), + Row("stddev", null, null, null), + Row("min", null, null, null), + Row("25%", null, null, null), + Row("50%", null, null, null), + Row("75%", null, null, null), + Row("max", null, null, null)) + + def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name) + + val summaryAllCols = person2.summary() + + assert(getSchemaAsSeq(summaryAllCols) === Seq("summary", "name", "age", "height")) + checkAnswer(summaryAllCols, summaryResult) + // All aggregate value should have been cast to string + summaryAllCols.collect().foreach { row => + row.toSeq.foreach { value => + if (value != null) { + assert(value.isInstanceOf[String], "expected string but found " + value.getClass) + } + } + } + + val summaryOneCol = person2.select("age").summary() + assert(getSchemaAsSeq(summaryOneCol) === Seq("summary", "age")) + checkAnswer(summaryOneCol, summaryResult.map { case Row(s, _, d, _) => Row(s, d)} ) + + val summaryNoCol = person2.select().summary() + assert(getSchemaAsSeq(summaryNoCol) === Seq("summary")) + checkAnswer(summaryNoCol, summaryResult.map { case Row(s, _, _, _) => Row(s)} ) + + val emptyDescription = person2.limit(0).summary() + assert(getSchemaAsSeq(emptyDescription) === Seq("summary", "name", "age", "height")) + checkAnswer(emptyDescription, emptySummaryResult) + } + + test("summary advanced") { + val stats = Array("count", "50.01%", "max", "mean", "min", "25%") + val orderMatters = person2.summary(stats: _*) + assert(orderMatters.collect().map(_.getString(0)) === stats) + + val onlyPercentiles = person2.summary("0.1%", "99.9%") + assert(onlyPercentiles.count() === 2) + + val fooE = intercept[IllegalArgumentException] { + person2.summary("foo") + } + assert(fooE.getMessage === "requirement failed: foo is not a recognised statistic") + + val parseE = intercept[IllegalArgumentException] { + person2.summary("foo%") + } + assert(parseE.getMessage === "Unable to parse foo% as a percentile") + } + test("apply on query results (SPARK-5462)") { val df = testData.sparkSession.sql("select key from testData") checkAnswer(df.select(df("key")), testData.select('key).collect().toSeq) From 7896e7b99d95d28800f5644bd36b3990cf0ef8c4 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 7 Jul 2017 23:05:38 -0700 Subject: [PATCH 125/779] [SPARK-21281][SQL] Use string types by default if array and map have no argument ## What changes were proposed in this pull request? This pr modified code to use string types by default if `array` and `map` in functions have no argument. 
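On the Spark side this corresponds to schemas like the following (an illustrative sketch, not part of the patch; it mirrors the new `DataFrameFunctionsSuite` test):

```scala
import org.apache.spark.sql.functions.{array, map}

// With no arguments, the element / key / value types now default to string:
spark.range(1).select(array()).schema.head.dataType
// ArrayType(StringType, containsNull = false)

spark.range(1).select(map()).schema.head.dataType
// MapType(StringType, StringType, valueContainsNull = false)
```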
This behaviour is the same with Hive one; ``` hive> CREATE TEMPORARY TABLE t1 AS SELECT map(); hive> DESCRIBE t1; _c0 map hive> CREATE TEMPORARY TABLE t2 AS SELECT array(); hive> DESCRIBE t2; _c0 array ``` ## How was this patch tested? Added tests in `DataFrameFunctionsSuite`. Author: Takeshi Yamamuro Closes #18516 from maropu/SPARK-21281. --- .../sql/catalyst/expressions/arithmetic.scala | 10 +++--- .../expressions/complexTypeCreator.scala | 35 ++++++++++-------- .../spark/sql/catalyst/expressions/hash.scala | 5 +-- .../expressions/nullExpressions.scala | 7 ++-- .../ExpressionTypeCheckingSuite.scala | 4 +-- .../org/apache/spark/sql/functions.scala | 10 ++---- .../spark/sql/DataFrameFunctionsSuite.scala | 36 +++++++++++++++++++ 7 files changed, 74 insertions(+), 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index ec6e6ba0f091b..423bf66a24d1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -527,13 +527,14 @@ case class Least(children: Seq[Expression]) extends Expression { override def checkInputDataTypes(): TypeCheckResult = { if (children.length <= 1) { - TypeCheckResult.TypeCheckFailure(s"LEAST requires at least 2 arguments") + TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName requires at least two arguments") } else if (children.map(_.dataType).distinct.count(_ != NullType) > 1) { TypeCheckResult.TypeCheckFailure( s"The expressions should all have the same type," + s" got LEAST(${children.map(_.dataType.simpleString).mkString(", ")}).") } else { - TypeUtils.checkForOrderingExpr(dataType, "function " + prettyName) + TypeUtils.checkForOrderingExpr(dataType, s"function $prettyName") } } @@ -592,13 +593,14 @@ case class Greatest(children: Seq[Expression]) extends Expression { override def checkInputDataTypes(): TypeCheckResult = { if (children.length <= 1) { - TypeCheckResult.TypeCheckFailure(s"GREATEST requires at least 2 arguments") + TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName requires at least two arguments") } else if (children.map(_.dataType).distinct.count(_ != NullType) > 1) { TypeCheckResult.TypeCheckFailure( s"The expressions should all have the same type," + s" got GREATEST(${children.map(_.dataType.simpleString).mkString(", ")}).") } else { - TypeUtils.checkForOrderingExpr(dataType, "function " + prettyName) + TypeUtils.checkForOrderingExpr(dataType, s"function $prettyName") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 98c4cbee38dee..d9eeb5358ef79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -41,12 +41,13 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def foldable: Boolean = children.forall(_.foldable) - override def checkInputDataTypes(): TypeCheckResult = - TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), "function array") + override def checkInputDataTypes(): TypeCheckResult = { + TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function 
$prettyName") + } override def dataType: ArrayType = { ArrayType( - children.headOption.map(_.dataType).getOrElse(NullType), + children.headOption.map(_.dataType).getOrElse(StringType), containsNull = children.exists(_.nullable)) } @@ -93,7 +94,7 @@ private [sql] object GenArrayData { if (!ctx.isPrimitiveType(elementType)) { val genericArrayClass = classOf[GenericArrayData].getName ctx.addMutableState("Object[]", arrayName, - s"$arrayName = new Object[${numElements}];") + s"$arrayName = new Object[$numElements];") val assignments = elementsCode.zipWithIndex.map { case (eval, i) => val isNullAssignment = if (!isMapKey) { @@ -119,7 +120,7 @@ private [sql] object GenArrayData { UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) val baseOffset = Platform.BYTE_ARRAY_OFFSET - ctx.addMutableState("UnsafeArrayData", arrayDataName, ""); + ctx.addMutableState("UnsafeArrayData", arrayDataName, "") val primitiveValueTypeName = ctx.primitiveTypeName(elementType) val assignments = elementsCode.zipWithIndex.map { case (eval, i) => @@ -169,13 +170,16 @@ case class CreateMap(children: Seq[Expression]) extends Expression { override def checkInputDataTypes(): TypeCheckResult = { if (children.size % 2 != 0) { - TypeCheckResult.TypeCheckFailure(s"$prettyName expects a positive even number of arguments.") + TypeCheckResult.TypeCheckFailure( + s"$prettyName expects a positive even number of arguments.") } else if (keys.map(_.dataType).distinct.length > 1) { - TypeCheckResult.TypeCheckFailure("The given keys of function map should all be the same " + - "type, but they are " + keys.map(_.dataType.simpleString).mkString("[", ", ", "]")) + TypeCheckResult.TypeCheckFailure( + "The given keys of function map should all be the same type, but they are " + + keys.map(_.dataType.simpleString).mkString("[", ", ", "]")) } else if (values.map(_.dataType).distinct.length > 1) { - TypeCheckResult.TypeCheckFailure("The given values of function map should all be the same " + - "type, but they are " + values.map(_.dataType.simpleString).mkString("[", ", ", "]")) + TypeCheckResult.TypeCheckFailure( + "The given values of function map should all be the same type, but they are " + + values.map(_.dataType.simpleString).mkString("[", ", ", "]")) } else { TypeCheckResult.TypeCheckSuccess } @@ -183,8 +187,8 @@ case class CreateMap(children: Seq[Expression]) extends Expression { override def dataType: DataType = { MapType( - keyType = keys.headOption.map(_.dataType).getOrElse(NullType), - valueType = values.headOption.map(_.dataType).getOrElse(NullType), + keyType = keys.headOption.map(_.dataType).getOrElse(StringType), + valueType = values.headOption.map(_.dataType).getOrElse(StringType), valueContainsNull = values.exists(_.nullable)) } @@ -292,14 +296,17 @@ trait CreateNamedStructLike extends Expression { } override def checkInputDataTypes(): TypeCheckResult = { - if (children.size % 2 != 0) { + if (children.length < 1) { + TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName requires at least one argument") + } else if (children.size % 2 != 0) { TypeCheckResult.TypeCheckFailure(s"$prettyName expects an even number of arguments.") } else { val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) if (invalidNames.nonEmpty) { TypeCheckResult.TypeCheckFailure( "Only foldable StringType expressions are allowed to appear at odd position, got:" + - s" ${invalidNames.mkString(",")}") + s" 
${invalidNames.mkString(",")}") } else if (!names.contains(null)) { TypeCheckResult.TypeCheckSuccess } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index ffd0e64d86cff..2476fc962a6fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -247,8 +247,9 @@ abstract class HashExpression[E] extends Expression { override def nullable: Boolean = false override def checkInputDataTypes(): TypeCheckResult = { - if (children.isEmpty) { - TypeCheckResult.TypeCheckFailure("function hash requires at least one argument") + if (children.length < 1) { + TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName requires at least one argument") } else { TypeCheckResult.TypeCheckSuccess } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 0866b8d791e01..1b625141d56ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -52,10 +52,11 @@ case class Coalesce(children: Seq[Expression]) extends Expression { override def foldable: Boolean = children.forall(_.foldable) override def checkInputDataTypes(): TypeCheckResult = { - if (children == Nil) { - TypeCheckResult.TypeCheckFailure("input to function coalesce cannot be empty") + if (children.length < 1) { + TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName requires at least one argument") } else { - TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), "function coalesce") + TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function $prettyName") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 30459f173ab52..30725773a37b1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -155,7 +155,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { "input to function array should all be the same type") assertError(Coalesce(Seq('intField, 'booleanField)), "input to function coalesce should all be the same type") - assertError(Coalesce(Nil), "input to function coalesce cannot be empty") + assertError(Coalesce(Nil), "function coalesce requires at least one argument") assertError(new Murmur3Hash(Nil), "function hash requires at least one argument") assertError(Explode('intField), "input to function explode should be array or map type") @@ -207,7 +207,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { test("check types for Greatest/Least") { for (operator <- Seq[(Seq[Expression] => Expression)](Greatest, Least)) { - assertError(operator(Seq('booleanField)), "requires at least 2 arguments") + assertError(operator(Seq('booleanField)), "requires at least two arguments") assertError(operator(Seq('intField, 'stringField)), "should all have the same type") assertError(operator(Seq('mapField, 'mapField)), 
"does not support ordering") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 3c67960d13e09..1263071a3ffd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1565,10 +1565,7 @@ object functions { * @since 1.5.0 */ @scala.annotation.varargs - def greatest(exprs: Column*): Column = withExpr { - require(exprs.length > 1, "greatest requires at least 2 arguments.") - Greatest(exprs.map(_.expr)) - } + def greatest(exprs: Column*): Column = withExpr { Greatest(exprs.map(_.expr)) } /** * Returns the greatest value of the list of column names, skipping null values. @@ -1672,10 +1669,7 @@ object functions { * @since 1.5.0 */ @scala.annotation.varargs - def least(exprs: Column*): Column = withExpr { - require(exprs.length > 1, "least requires at least 2 arguments.") - Least(exprs.map(_.expr)) - } + def least(exprs: Column*): Column = withExpr { Least(exprs.map(_.expr)) } /** * Returns the least value of the list of column names, skipping null values. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 0e9a2c6cf7dec..0681b9cbeb1d8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -448,6 +448,42 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { rand(Random.nextLong()), randn(Random.nextLong()) ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } + + test("SPARK-21281 use string types by default if array and map have no argument") { + val ds = spark.range(1) + var expectedSchema = new StructType() + .add("x", ArrayType(StringType, containsNull = false), nullable = false) + assert(ds.select(array().as("x")).schema == expectedSchema) + expectedSchema = new StructType() + .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) + assert(ds.select(map().as("x")).schema == expectedSchema) + } + + test("SPARK-21281 fails if functions have no argument") { + val df = Seq(1).toDF("a") + + val funcsMustHaveAtLeastOneArg = + ("coalesce", (df: DataFrame) => df.select(coalesce())) :: + ("coalesce", (df: DataFrame) => df.selectExpr("coalesce()")) :: + ("named_struct", (df: DataFrame) => df.select(struct())) :: + ("named_struct", (df: DataFrame) => df.selectExpr("named_struct()")) :: + ("hash", (df: DataFrame) => df.select(hash())) :: + ("hash", (df: DataFrame) => df.selectExpr("hash()")) :: Nil + funcsMustHaveAtLeastOneArg.foreach { case (name, func) => + val errMsg = intercept[AnalysisException] { func(df) }.getMessage + assert(errMsg.contains(s"input to function $name requires at least one argument")) + } + + val funcsMustHaveAtLeastTwoArgs = + ("greatest", (df: DataFrame) => df.select(greatest())) :: + ("greatest", (df: DataFrame) => df.selectExpr("greatest()")) :: + ("least", (df: DataFrame) => df.select(least())) :: + ("least", (df: DataFrame) => df.selectExpr("least()")) :: Nil + funcsMustHaveAtLeastTwoArgs.foreach { case (name, func) => + val errMsg = intercept[AnalysisException] { func(df) }.getMessage + assert(errMsg.contains(s"input to function $name requires at least two arguments")) + } + } } object DataFrameFunctionsSuite { From 9760c15acbcf755dd5b13597ceb333576f806ecf Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: 
Sat, 8 Jul 2017 14:20:09 +0800 Subject: [PATCH 126/779] [SPARK-20379][CORE] Allow SSL config to reference env variables. This change exposes the internal code path in SparkConf that allows configs to be read with variable substitution applied, and uses that new method in SSLOptions so that SSL configs can reference other variables, and more importantly, environment variables, providing a secure way to provide passwords to Spark when using SSL. The approach is a little bit hacky, but is the smallest change possible. Otherwise, the concept of "namespaced configs" would have to be added to the config system, which would create a lot of noise for not much gain at this point. Tested with added unit tests, and on a real cluster with SSL enabled. Author: Marcelo Vanzin Closes #18394 from vanzin/SPARK-20379.try2. --- .../scala/org/apache/spark/SSLOptions.scala | 20 +++++++++---------- .../scala/org/apache/spark/SparkConf.scala | 5 +++++ .../org/apache/spark/SSLOptionsSuite.scala | 16 +++++++++++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index 29163e7f30546..f86fd20e59190 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -167,39 +167,39 @@ private[spark] object SSLOptions extends Logging { def parse(conf: SparkConf, ns: String, defaults: Option[SSLOptions] = None): SSLOptions = { val enabled = conf.getBoolean(s"$ns.enabled", defaultValue = defaults.exists(_.enabled)) - val port = conf.getOption(s"$ns.port").map(_.toInt) + val port = conf.getWithSubstitution(s"$ns.port").map(_.toInt) port.foreach { p => require(p >= 0, "Port number must be a non-negative value.") } - val keyStore = conf.getOption(s"$ns.keyStore").map(new File(_)) + val keyStore = conf.getWithSubstitution(s"$ns.keyStore").map(new File(_)) .orElse(defaults.flatMap(_.keyStore)) - val keyStorePassword = conf.getOption(s"$ns.keyStorePassword") + val keyStorePassword = conf.getWithSubstitution(s"$ns.keyStorePassword") .orElse(defaults.flatMap(_.keyStorePassword)) - val keyPassword = conf.getOption(s"$ns.keyPassword") + val keyPassword = conf.getWithSubstitution(s"$ns.keyPassword") .orElse(defaults.flatMap(_.keyPassword)) - val keyStoreType = conf.getOption(s"$ns.keyStoreType") + val keyStoreType = conf.getWithSubstitution(s"$ns.keyStoreType") .orElse(defaults.flatMap(_.keyStoreType)) val needClientAuth = conf.getBoolean(s"$ns.needClientAuth", defaultValue = defaults.exists(_.needClientAuth)) - val trustStore = conf.getOption(s"$ns.trustStore").map(new File(_)) + val trustStore = conf.getWithSubstitution(s"$ns.trustStore").map(new File(_)) .orElse(defaults.flatMap(_.trustStore)) - val trustStorePassword = conf.getOption(s"$ns.trustStorePassword") + val trustStorePassword = conf.getWithSubstitution(s"$ns.trustStorePassword") .orElse(defaults.flatMap(_.trustStorePassword)) - val trustStoreType = conf.getOption(s"$ns.trustStoreType") + val trustStoreType = conf.getWithSubstitution(s"$ns.trustStoreType") .orElse(defaults.flatMap(_.trustStoreType)) - val protocol = conf.getOption(s"$ns.protocol") + val protocol = conf.getWithSubstitution(s"$ns.protocol") .orElse(defaults.flatMap(_.protocol)) - val enabledAlgorithms = conf.getOption(s"$ns.enabledAlgorithms") + val enabledAlgorithms = conf.getWithSubstitution(s"$ns.enabledAlgorithms") .map(_.split(",").map(_.trim).filter(_.nonEmpty).toSet) .orElse(defaults.map(_.enabledAlgorithms)) 
.getOrElse(Set.empty) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index de2f475c6895f..715cfdcc8f4ef 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -373,6 +373,11 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria Option(settings.get(key)).orElse(getDeprecatedConfig(key, this)) } + /** Get an optional value, applying variable substitution. */ + private[spark] def getWithSubstitution(key: String): Option[String] = { + getOption(key).map(reader.substitute(_)) + } + /** Get all parameters as a list of pairs */ def getAll: Array[(String, String)] = { settings.entrySet().asScala.map(x => (x.getKey, x.getValue)).toArray diff --git a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala index 6fc7cea6ee94a..8eabc2b3cb958 100644 --- a/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/SSLOptionsSuite.scala @@ -22,6 +22,8 @@ import javax.net.ssl.SSLContext import org.scalatest.BeforeAndAfterAll +import org.apache.spark.util.SparkConfWithEnv + class SSLOptionsSuite extends SparkFunSuite with BeforeAndAfterAll { test("test resolving property file as spark conf ") { @@ -133,4 +135,18 @@ class SSLOptionsSuite extends SparkFunSuite with BeforeAndAfterAll { assert(opts.enabledAlgorithms === Set("ABC", "DEF")) } + test("variable substitution") { + val conf = new SparkConfWithEnv(Map( + "ENV1" -> "val1", + "ENV2" -> "val2")) + + conf.set("spark.ssl.enabled", "true") + conf.set("spark.ssl.keyStore", "${env:ENV1}") + conf.set("spark.ssl.trustStore", "${env:ENV2}") + + val opts = SSLOptions.parse(conf, "spark.ssl", defaults = None) + assert(opts.keyStore === Some(new File("val1"))) + assert(opts.trustStore === Some(new File("val2"))) + } + } From d0bfc6733521709e453d643582df2bdd68f28de7 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Fri, 7 Jul 2017 23:33:12 -0700 Subject: [PATCH 127/779] [SPARK-21069][SS][DOCS] Add rate source to programming guide. ## What changes were proposed in this pull request? SPARK-20979 added a new structured streaming source: Rate source. This patch adds the corresponding documentation to programming guide. ## How was this patch tested? Tested by running jekyll locally. Author: Prashant Sharma Author: Prashant Sharma Closes #18562 from ScrapCodes/spark-21069/rate-source-docs. --- docs/structured-streaming-programming-guide.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 3bc377c9a38b5..8f64faadc32dc 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -499,6 +499,8 @@ There are a few built-in sources. - **Socket source (for testing)** - Reads UTF8 text data from a socket connection. The listening server socket is at the driver. Note that this should be used only for testing as this does not provide end-to-end fault-tolerance guarantees. + - **Rate source (for testing)** - Generates data at the specified number of rows per second, each output row contains a `timestamp` and `value`. Where `timestamp` is a `Timestamp` type containing the time of message dispatch, and `value` is of `Long` type containing the message count, starting from 0 as the first row. 
This source is intended for testing and benchmarking. + Some sources are not fault-tolerant because they do not guarantee that data can be replayed using checkpointed offsets after a failure. See the earlier section on [fault-tolerance semantics](#fault-tolerance-semantics). @@ -546,6 +548,19 @@ Here are the details of all the sources in Spark. No + + Rate Source + + rowsPerSecond (e.g. 100, default: 1): How many rows should be generated per second.
+ rampUpTime (e.g. 5s, default: 0s): How long to ramp up before the generating speed becomes rowsPerSecond. Using finer granularities than seconds will be truncated to integer seconds.
+ numPartitions (e.g. 10, default: Spark's default parallelism): The partition number for the generated rows.
+ + The source will try its best to reach rowsPerSecond, but the query may be resource constrained, and numPartitions can be tweaked to help reach the desired speed. + + Yes + + + Kafka Source From a7b46c627b5d2461257f337139a29f23350e0c77 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 7 Jul 2017 23:51:32 -0700 Subject: [PATCH 128/779] [SPARK-20307][SPARKR] SparkR: pass on setHandleInvalid to spark.mllib functions that use StringIndexer ## What changes were proposed in this pull request? For randomForest classifier, if test data contains unseen labels, it will throw an error. The StringIndexer already has the handleInvalid logic. The patch add a new method to set the underlying StringIndexer handleInvalid logic. This patch should also apply to other classifiers. This PR focuses on the main logic and randomForest classifier. I will do follow-up PR for other classifiers. ## How was this patch tested? Add a new unit test based on the error case in the JIRA. Author: wangmiao1981 Closes #18496 from wangmiao1981/handle. --- R/pkg/R/mllib_tree.R | 11 ++++++-- R/pkg/tests/fulltests/test_mllib_tree.R | 17 +++++++++++++ .../apache/spark/ml/feature/RFormula.scala | 25 +++++++++++++++++++ .../r/RandomForestClassificationWrapper.scala | 4 ++- .../spark/ml/feature/StringIndexerSuite.scala | 2 +- 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 2f1220a752783..75b1a74ee8c7c 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -374,6 +374,10 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in classification model. +#' Supported options: "skip" (filter out rows with invalid data), +#' "error" (throw an error), "keep" (put invalid data in a special additional +#' bucket, at index numLabels). Default is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. 
@@ -409,7 +413,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, - maxMemoryInMB = 256, cacheNodeIds = FALSE) { + maxMemoryInMB = 256, cacheNodeIds = FALSE, + handleInvalid = c("error", "keep", "skip")) { type <- match.arg(type) formula <- paste(deparse(formula), collapse = "") if (!is.null(seed)) { @@ -430,6 +435,7 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo new("RandomForestRegressionModel", jobj = jobj) }, classification = { + handleInvalid <- match.arg(handleInvalid) if (is.null(impurity)) impurity <- "gini" impurity <- match.arg(impurity, c("gini", "entropy")) jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper", @@ -439,7 +445,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo as.numeric(minInfoGain), as.integer(checkpointInterval), as.character(featureSubsetStrategy), seed, as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + as.integer(maxMemoryInMB), as.logical(cacheNodeIds), + handleInvalid) new("RandomForestClassificationModel", jobj = jobj) } ) diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R index 9b3fc8d270b25..66a0693a59a52 100644 --- a/R/pkg/tests/fulltests/test_mllib_tree.R +++ b/R/pkg/tests/fulltests/test_mllib_tree.R @@ -212,6 +212,23 @@ test_that("spark.randomForest", { expect_equal(length(grep("1.0", predictions)), 50) expect_equal(length(grep("2.0", predictions)), 50) + # Test unseen labels + data <- data.frame(clicked = base::sample(c(0, 1), 10, replace = TRUE), + someString = base::sample(c("this", "that"), 10, replace = TRUE), + stringsAsFactors = FALSE) + trainidxs <- base::sample(nrow(data), nrow(data) * 0.7) + traindf <- as.DataFrame(data[trainidxs, ]) + testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other"))) + model <- spark.randomForest(traindf, clicked ~ ., type = "classification", + maxDepth = 10, maxBins = 10, numTrees = 10) + predictions <- predict(model, testdf) + expect_error(collect(predictions)) + model <- spark.randomForest(traindf, clicked ~ ., type = "classification", + maxDepth = 10, maxBins = 10, numTrees = 10, + handleInvalid = "skip") + predictions <- predict(model, testdf) + expect_equal(class(collect(predictions)$clicked[1]), "character") + # spark.randomForest classification can work on libsvm data if (windows_with_hadoop()) { data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 4b44878784c90..61aa6463bb6da 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -132,6 +132,30 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) @Since("1.5.0") def getFormula: String = $(formula) + /** + * Param for how to handle invalid data (unseen labels or NULL values). + * Options are 'skip' (filter out rows with invalid data), + * 'error' (throw an error), or 'keep' (put invalid data in a special additional + * bucket, at index numLabels). 
+ * Default: "error" + * @group param + */ + @Since("2.3.0") + val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle " + + "invalid data (unseen labels or NULL values). " + + "Options are 'skip' (filter out rows with invalid data), error (throw an error), " + + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", + ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) + setDefault(handleInvalid, StringIndexer.ERROR_INVALID) + + /** @group setParam */ + @Since("2.3.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + + /** @group getParam */ + @Since("2.3.0") + def getHandleInvalid: String = $(handleInvalid) + /** @group setParam */ @Since("1.5.0") def setFeaturesCol(value: String): this.type = set(featuresCol, value) @@ -197,6 +221,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) .setInputCol(term) .setOutputCol(indexCol) .setStringOrderType($(stringIndexerOrderType)) + .setHandleInvalid($(handleInvalid)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) case _ => diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala index 8a83d4e980f7b..132345fb9a6d9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala @@ -78,11 +78,13 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC seed: String, subsamplingRate: Double, maxMemoryInMB: Int, - cacheNodeIds: Boolean): RandomForestClassifierWrapper = { + cacheNodeIds: Boolean, + handleInvalid: String): RandomForestClassifierWrapper = { val rFormula = new RFormula() .setFormula(formula) .setForceIndexLabel(true) + .setHandleInvalid(handleInvalid) checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 806a92760c8b6..027b1fbc6657c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, Identifiable, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col From f5f02d213d3151f58070e113d64fcded4f5d401e Mon Sep 17 00:00:00 2001 From: Michael Patterson Date: Fri, 7 Jul 2017 23:59:34 -0700 Subject: [PATCH 129/779] [SPARK-20456][DOCS] Add examples for functions collection for pyspark ## What changes were proposed in this pull request? This adds documentation to many functions in pyspark.sql.functions.py: `upper`, `lower`, `reverse`, `unix_timestamp`, `from_unixtime`, `rand`, `randn`, `collect_list`, `collect_set`, `lit` Add units to the trigonometry functions. Renames columns in datetime examples to be more informative. Adds links between some functions. ## How was this patch tested? 
`./dev/lint-python` `python python/pyspark/sql/functions.py` `./python/run-tests.py --module pyspark-sql` Author: Michael Patterson Closes #17865 from map222/spark-20456. --- R/pkg/R/functions.R | 11 +- python/pyspark/sql/functions.py | 166 +++++++++++------- .../org/apache/spark/sql/functions.scala | 14 +- 3 files changed, 119 insertions(+), 72 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index c529d83060f50..f28d26a51baa0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -336,7 +336,8 @@ setMethod("asin", }) #' @details -#' \code{atan}: Computes the tangent inverse of the given value. +#' \code{atan}: Computes the tangent inverse of the given value; the returned angle is in the range +#' -pi/2 through pi/2. #' #' @rdname column_math_functions #' @export @@ -599,7 +600,7 @@ setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOr }) #' @details -#' \code{cos}: Computes the cosine of the given value. +#' \code{cos}: Computes the cosine of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases cos cos,Column-method @@ -1407,7 +1408,7 @@ setMethod("sign", signature(x = "Column"), }) #' @details -#' \code{sin}: Computes the sine of the given value. +#' \code{sin}: Computes the sine of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases sin sin,Column-method @@ -1597,7 +1598,7 @@ setMethod("sumDistinct", }) #' @details -#' \code{tan}: Computes the tangent of the given value. +#' \code{tan}: Computes the tangent of the given value. Units in radians. #' #' @rdname column_math_functions #' @aliases tan tan,Column-method @@ -1896,7 +1897,7 @@ setMethod("year", #' @details #' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates -#' (x, y) to polar coordinates (r, theta). +#' (x, y) to polar coordinates (r, theta). Units in radians. #' #' @rdname column_math_functions #' @aliases atan2 atan2,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 3416c4b118a07..5d8ded83f667d 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -67,9 +67,14 @@ def _(): _.__doc__ = 'Window function: ' + doc return _ +_lit_doc = """ + Creates a :class:`Column` of literal value. + >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1) + [Row(height=5, spark_user=True)] + """ _functions = { - 'lit': 'Creates a :class:`Column` of literal value.', + 'lit': _lit_doc, 'col': 'Returns a :class:`Column` based on the given column name.', 'column': 'Returns a :class:`Column` based on the given column name.', 'asc': 'Returns a sort expression based on the ascending order of the given column name.', @@ -95,10 +100,13 @@ def _(): '0.0 through pi.', 'asin': 'Computes the sine inverse of the given value; the returned angle is in the range' + '-pi/2 through pi/2.', - 'atan': 'Computes the tangent inverse of the given value.', + 'atan': 'Computes the tangent inverse of the given value; the returned angle is in the range' + + '-pi/2 through pi/2', 'cbrt': 'Computes the cube-root of the given value.', 'ceil': 'Computes the ceiling of the given value.', - 'cos': 'Computes the cosine of the given value.', + 'cos': """Computes the cosine of the given value. 
+ + :param col: :class:`DoubleType` column, units in radians.""", 'cosh': 'Computes the hyperbolic cosine of the given value.', 'exp': 'Computes the exponential of the given value.', 'expm1': 'Computes the exponential of the given value minus one.', @@ -109,15 +117,33 @@ def _(): 'rint': 'Returns the double value that is closest in value to the argument and' + ' is equal to a mathematical integer.', 'signum': 'Computes the signum of the given value.', - 'sin': 'Computes the sine of the given value.', + 'sin': """Computes the sine of the given value. + + :param col: :class:`DoubleType` column, units in radians.""", 'sinh': 'Computes the hyperbolic sine of the given value.', - 'tan': 'Computes the tangent of the given value.', + 'tan': """Computes the tangent of the given value. + + :param col: :class:`DoubleType` column, units in radians.""", 'tanh': 'Computes the hyperbolic tangent of the given value.', - 'toDegrees': '.. note:: Deprecated in 2.1, use degrees instead.', - 'toRadians': '.. note:: Deprecated in 2.1, use radians instead.', + 'toDegrees': '.. note:: Deprecated in 2.1, use :func:`degrees` instead.', + 'toRadians': '.. note:: Deprecated in 2.1, use :func:`radians` instead.', 'bitwiseNOT': 'Computes bitwise not.', } +_collect_list_doc = """ + Aggregate function: returns a list of objects with duplicates. + + >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) + >>> df2.agg(collect_list('age')).collect() + [Row(collect_list(age)=[2, 5, 5])] + """ +_collect_set_doc = """ + Aggregate function: returns a set of objects with duplicate elements eliminated. + + >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) + >>> df2.agg(collect_set('age')).collect() + [Row(collect_set(age)=[5, 2])] + """ _functions_1_6 = { # unary math functions 'stddev': 'Aggregate function: returns the unbiased sample standard deviation of' + @@ -131,9 +157,8 @@ def _(): 'var_pop': 'Aggregate function: returns the population variance of the values in a group.', 'skewness': 'Aggregate function: returns the skewness of the values in a group.', 'kurtosis': 'Aggregate function: returns the kurtosis of the values in a group.', - 'collect_list': 'Aggregate function: returns a list of objects with duplicates.', - 'collect_set': 'Aggregate function: returns a set of objects with duplicate elements' + - ' eliminated.', + 'collect_list': _collect_list_doc, + 'collect_set': _collect_set_doc } _functions_2_1 = { @@ -147,7 +172,7 @@ def _(): # math functions that take two arguments as input _binary_mathfunctions = { 'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' + - 'polar coordinates (r, theta).', + 'polar coordinates (r, theta). Units in radians.', 'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.', 'pow': 'Returns the value of the first argument raised to the power of the second argument.', } @@ -200,17 +225,20 @@ def _(): @since(1.3) def approxCountDistinct(col, rsd=None): """ - .. note:: Deprecated in 2.1, use approx_count_distinct instead. + .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead. """ return approx_count_distinct(col, rsd) @since(2.1) def approx_count_distinct(col, rsd=None): - """Returns a new :class:`Column` for approximate distinct count of ``col``. + """Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`. 
- >>> df.agg(approx_count_distinct(df.age).alias('c')).collect() - [Row(c=2)] + :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more + efficient to use :func:`countDistinct` + + >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect() + [Row(distinct_ages=2)] """ sc = SparkContext._active_spark_context if rsd is None: @@ -267,8 +295,7 @@ def coalesce(*cols): @since(1.6) def corr(col1, col2): - """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` - and ``col2``. + """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``. >>> a = range(20) >>> b = [2 * x for x in range(20)] @@ -282,8 +309,7 @@ def corr(col1, col2): @since(2.0) def covar_pop(col1, col2): - """Returns a new :class:`Column` for the population covariance of ``col1`` - and ``col2``. + """Returns a new :class:`Column` for the population covariance of ``col1`` and ``col2``. >>> a = [1] * 10 >>> b = [1] * 10 @@ -297,8 +323,7 @@ def covar_pop(col1, col2): @since(2.0) def covar_samp(col1, col2): - """Returns a new :class:`Column` for the sample covariance of ``col1`` - and ``col2``. + """Returns a new :class:`Column` for the sample covariance of ``col1`` and ``col2``. >>> a = [1] * 10 >>> b = [1] * 10 @@ -450,7 +475,7 @@ def monotonically_increasing_id(): def nanvl(col1, col2): """Returns col1 if it is not NaN, or col2 if col1 is NaN. - Both inputs should be floating point columns (DoubleType or FloatType). + Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`). >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect() @@ -460,10 +485,15 @@ def nanvl(col1, col2): return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2))) +@ignore_unicode_prefix @since(1.4) def rand(seed=None): """Generates a random column with independent and identically distributed (i.i.d.) samples from U[0.0, 1.0]. + + >>> df.withColumn('rand', rand(seed=42) * 3).collect() + [Row(age=2, name=u'Alice', rand=1.1568609015300986), + Row(age=5, name=u'Bob', rand=1.403379671529166)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -473,10 +503,15 @@ def rand(seed=None): return Column(jc) +@ignore_unicode_prefix @since(1.4) def randn(seed=None): """Generates a column with independent and identically distributed (i.i.d.) samples from the standard normal distribution. + + >>> df.withColumn('randn', randn(seed=42)).collect() + [Row(age=2, name=u'Alice', randn=-0.7556247885860078), + Row(age=5, name=u'Bob', randn=-0.0861619008451133)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -760,7 +795,7 @@ def ntile(n): @since(1.5) def current_date(): """ - Returns the current date as a date column. + Returns the current date as a :class:`DateType` column. """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.current_date()) @@ -768,7 +803,7 @@ def current_date(): def current_timestamp(): """ - Returns the current timestamp as a timestamp column. + Returns the current timestamp as a :class:`TimestampType` column. """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.current_timestamp()) @@ -787,8 +822,8 @@ def date_format(date, format): .. note:: Use when ever possible specialized functions like `year`. These benefit from a specialized implementation. 
- >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect() [Row(date=u'04/08/2015')] """ sc = SparkContext._active_spark_context @@ -800,8 +835,8 @@ def year(col): """ Extract the year of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(year('a').alias('year')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(year('dt').alias('year')).collect() [Row(year=2015)] """ sc = SparkContext._active_spark_context @@ -813,8 +848,8 @@ def quarter(col): """ Extract the quarter of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(quarter('a').alias('quarter')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(quarter('dt').alias('quarter')).collect() [Row(quarter=2)] """ sc = SparkContext._active_spark_context @@ -826,8 +861,8 @@ def month(col): """ Extract the month of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(month('a').alias('month')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(month('dt').alias('month')).collect() [Row(month=4)] """ sc = SparkContext._active_spark_context @@ -839,8 +874,8 @@ def dayofmonth(col): """ Extract the day of the month of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(dayofmonth('a').alias('day')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(dayofmonth('dt').alias('day')).collect() [Row(day=8)] """ sc = SparkContext._active_spark_context @@ -852,8 +887,8 @@ def dayofyear(col): """ Extract the day of the year of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(dayofyear('a').alias('day')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(dayofyear('dt').alias('day')).collect() [Row(day=98)] """ sc = SparkContext._active_spark_context @@ -865,8 +900,8 @@ def hour(col): """ Extract the hours of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a']) - >>> df.select(hour('a').alias('hour')).collect() + >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) + >>> df.select(hour('ts').alias('hour')).collect() [Row(hour=13)] """ sc = SparkContext._active_spark_context @@ -878,8 +913,8 @@ def minute(col): """ Extract the minutes of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a']) - >>> df.select(minute('a').alias('minute')).collect() + >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) + >>> df.select(minute('ts').alias('minute')).collect() [Row(minute=8)] """ sc = SparkContext._active_spark_context @@ -891,8 +926,8 @@ def second(col): """ Extract the seconds of a given date as integer. - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a']) - >>> df.select(second('a').alias('second')).collect() + >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) + >>> df.select(second('ts').alias('second')).collect() [Row(second=15)] """ sc = SparkContext._active_spark_context @@ -904,8 +939,8 @@ def weekofyear(col): """ Extract the week number of a given date as integer. 
- >>> df = spark.createDataFrame([('2015-04-08',)], ['a']) - >>> df.select(weekofyear(df.a).alias('week')).collect() + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(weekofyear(df.dt).alias('week')).collect() [Row(week=15)] """ sc = SparkContext._active_spark_context @@ -917,9 +952,9 @@ def date_add(start, days): """ Returns the date that is `days` days after `start` - >>> df = spark.createDataFrame([('2015-04-08',)], ['d']) - >>> df.select(date_add(df.d, 1).alias('d')).collect() - [Row(d=datetime.date(2015, 4, 9))] + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(date_add(df.dt, 1).alias('next_date')).collect() + [Row(next_date=datetime.date(2015, 4, 9))] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.date_add(_to_java_column(start), days)) @@ -930,9 +965,9 @@ def date_sub(start, days): """ Returns the date that is `days` days before `start` - >>> df = spark.createDataFrame([('2015-04-08',)], ['d']) - >>> df.select(date_sub(df.d, 1).alias('d')).collect() - [Row(d=datetime.date(2015, 4, 7))] + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect() + [Row(prev_date=datetime.date(2015, 4, 7))] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.date_sub(_to_java_column(start), days)) @@ -956,9 +991,9 @@ def add_months(start, months): """ Returns the date that is `months` months after `start` - >>> df = spark.createDataFrame([('2015-04-08',)], ['d']) - >>> df.select(add_months(df.d, 1).alias('d')).collect() - [Row(d=datetime.date(2015, 5, 8))] + >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> df.select(add_months(df.dt, 1).alias('next_month')).collect() + [Row(next_month=datetime.date(2015, 5, 8))] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.add_months(_to_java_column(start), months)) @@ -969,8 +1004,8 @@ def months_between(date1, date2): """ Returns the number of months between date1 and date2. - >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['t', 'd']) - >>> df.select(months_between(df.t, df.d).alias('months')).collect() + >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2']) + >>> df.select(months_between(df.date1, df.date2).alias('months')).collect() [Row(months=3.9495967...)] """ sc = SparkContext._active_spark_context @@ -1073,12 +1108,17 @@ def last_day(date): return Column(sc._jvm.functions.last_day(_to_java_column(date))) +@ignore_unicode_prefix @since(1.5) def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): """ Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment in the current system time zone in the given format. + + >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time']) + >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect() + [Row(ts=u'2015-04-08 00:00:00')] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.from_unixtime(_to_java_column(timestamp), format)) @@ -1092,6 +1132,10 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'): locale, return null if fail. if `timestamp` is None, then it returns current timestamp. 
+ + >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt']) + >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect() + [Row(unix_time=1428476400)] """ sc = SparkContext._active_spark_context if timestamp is None: @@ -1106,8 +1150,8 @@ def from_utc_timestamp(timestamp, tz): that corresponds to the same time of day in the given timezone. >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect() - [Row(t=datetime.datetime(1997, 2, 28, 2, 30))] + >>> df.select(from_utc_timestamp(df.t, "PST").alias('local_time')).collect() + [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz)) @@ -1119,9 +1163,9 @@ def to_utc_timestamp(timestamp, tz): Given a timestamp, which corresponds to a certain time of day in the given timezone, returns another timestamp that corresponds to the same time of day in UTC. - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect() - [Row(t=datetime.datetime(1997, 2, 28, 18, 30))] + >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts']) + >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect() + [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz)) @@ -2095,7 +2139,7 @@ def _test(): sc = spark.sparkContext globs['sc'] = sc globs['spark'] = spark - globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() + globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)]) (failure_count, test_count) = doctest.testmod( pyspark.sql.functions, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 1263071a3ffd5..a5e4a444f33be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -1321,7 +1321,8 @@ object functions { def asin(columnName: String): Column = asin(Column(columnName)) /** - * Computes the tangent inverse of the given value. + * Computes the tangent inverse of the given column; the returned angle is in the range + * -pi/2 through pi/2 * * @group math_funcs * @since 1.4.0 @@ -1329,7 +1330,8 @@ object functions { def atan(e: Column): Column = withExpr { Atan(e.expr) } /** - * Computes the tangent inverse of the given column. + * Computes the tangent inverse of the given column; the returned angle is in the range + * -pi/2 through pi/2 * * @group math_funcs * @since 1.4.0 @@ -1338,7 +1340,7 @@ object functions { /** * Returns the angle theta from the conversion of rectangular coordinates (x, y) to - * polar coordinates (r, theta). + * polar coordinates (r, theta). Units in radians. * * @group math_funcs * @since 1.4.0 @@ -1470,7 +1472,7 @@ object functions { } /** - * Computes the cosine of the given value. + * Computes the cosine of the given value. Units in radians. * * @group math_funcs * @since 1.4.0 @@ -1937,7 +1939,7 @@ object functions { def signum(columnName: String): Column = signum(Column(columnName)) /** - * Computes the sine of the given value. + * Computes the sine of the given value. 
Units in radians. * * @group math_funcs * @since 1.4.0 @@ -1969,7 +1971,7 @@ object functions { def sinh(columnName: String): Column = sinh(Column(columnName)) /** - * Computes the tangent of the given value. + * Computes the tangent of the given value. Units in radians. * * @group math_funcs * @since 1.4.0 From 01f183e8497d4931f1fe5c69ff16fe84b1e41492 Mon Sep 17 00:00:00 2001 From: Joachim Hereth Date: Sat, 8 Jul 2017 08:32:45 +0100 Subject: [PATCH 130/779] Mesos doc fixes ## What changes were proposed in this pull request? Some link fixes for the documentation [Running Spark on Mesos](https://spark.apache.org/docs/latest/running-on-mesos.html): * Updated Link to Mesos Frameworks (Projects built on top of Mesos) * Update Link to Mesos binaries from Mesosphere (former link was redirected to dcos install page) ## How was this patch tested? Documentation was built and changed page manually/visually inspected. No code was changed, hence no dev tests. Since these changes are rather trivial I did not open a new JIRA ticket. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Joachim Hereth Closes #18564 from daten-kieker/mesos_doc_fixes. --- docs/running-on-mesos.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index ec130c1db8f5f..7401b63e022c1 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -10,7 +10,7 @@ Spark can run on hardware clusters managed by [Apache Mesos](http://mesos.apache The advantages of deploying Spark with Mesos include: - dynamic partitioning between Spark and other - [frameworks](https://mesos.apache.org/documentation/latest/mesos-frameworks/) + [frameworks](https://mesos.apache.org/documentation/latest/frameworks/) - scalable partitioning between multiple instances of Spark # How it Works @@ -61,7 +61,7 @@ third party projects publish binary releases that may be helpful in setting Meso One of those is Mesosphere. To install Mesos using the binary releases provided by Mesosphere: -1. Download Mesos installation package from [downloads page](http://mesosphere.io/downloads/) +1. Download Mesos installation package from [downloads page](https://open.mesosphere.com/downloads/mesos/) 2. Follow their instructions for installation and configuration The Mesosphere installation documents suggest setting up ZooKeeper to handle Mesos master failover, From 330bf5c99825afb6129577a34e6bed8b221a98cc Mon Sep 17 00:00:00 2001 From: caoxuewen Date: Sat, 8 Jul 2017 08:34:51 +0100 Subject: [PATCH 131/779] [SPARK-20609][MLLIB][TEST] manually cleared 'spark.local.dir' before/after a test in ALSCleanerSuite ## What changes were proposed in this pull request? This PR is similar to #17869. Once` 'spark.local.dir'` is set. Unless this is manually cleared before/after a test. it could return the same directory even if this property is configured. and add before/after for each likewise in ALSCleanerSuite. ## How was this patch tested? existing test. Author: caoxuewen Closes #18537 from heary-cao/ALSCleanerSuite. 
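As a quick reference, below is a minimal sketch of the before/after-each isolation pattern this patch applies. The suite and test names are illustrative only, and the snippet assumes it is compiled inside Spark's own test tree (it is placed under the org.apache.spark package so that the private[spark] SparkFunSuite and Utils.clearLocalRootDirs() helpers used by the patch are visible):

    package org.apache.spark

    import org.scalatest.BeforeAndAfterEach

    import org.apache.spark.util.Utils

    class LocalDirIsolationExampleSuite extends SparkFunSuite with BeforeAndAfterEach {

      override def beforeEach(): Unit = {
        super.beforeEach()
        // Utils.getOrCreateLocalRootDirs caches its result in Utils.localRootDirs, so clear
        // the cache first to make sure a 'spark.local.dir' set by this test is picked up.
        Utils.clearLocalRootDirs()
      }

      override def afterEach(): Unit = {
        // Clear again on the way out so this test's directories do not leak into later suites.
        Utils.clearLocalRootDirs()
        super.afterEach()
      }

      test("spark.local.dir is re-read for this test") {
        // A test body would configure spark.local.dir and create a SparkContext here.
      }
    }

Clearing in both hooks keeps the cached value from leaking in either direction, which is the same choice the ALSCleanerSuite change below makes.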
--- .../spark/ml/recommendation/ALSSuite.scala | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index b57fc8d21ab34..0a0fea255c7f3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -29,6 +29,7 @@ import scala.language.existentials import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter +import org.scalatest.BeforeAndAfterEach import org.apache.spark._ import org.apache.spark.internal.Logging @@ -777,7 +778,20 @@ class ALSSuite } } -class ALSCleanerSuite extends SparkFunSuite { +class ALSCleanerSuite extends SparkFunSuite with BeforeAndAfterEach { + override def beforeEach(): Unit = { + super.beforeEach() + // Once `Utils.getOrCreateLocalRootDirs` is called, it is cached in `Utils.localRootDirs`. + // Unless this is manually cleared before and after a test, it returns the same directory + // set before even if 'spark.local.dir' is configured afterwards. + Utils.clearLocalRootDirs() + } + + override def afterEach(): Unit = { + Utils.clearLocalRootDirs() + super.afterEach() + } + test("ALS shuffle cleanup standalone") { val conf = new SparkConf() val localDir = Utils.createTempDir() From 0b8dd2d08460f3e6eb578727d2c336b6f11959e7 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 8 Jul 2017 20:16:47 +0800 Subject: [PATCH 132/779] [SPARK-21345][SQL][TEST][TEST-MAVEN] SparkSessionBuilderSuite should clean up stopped sessions. ## What changes were proposed in this pull request? `SparkSessionBuilderSuite` should clean up stopped sessions. Otherwise, it leaves behind some stopped `SparkContext`s interfereing with other test suites using `ShardSQLContext`. Recently, master branch fails consequtively. - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/ ## How was this patch tested? Pass the Jenkins with a updated suite. Author: Dongjoon Hyun Closes #18567 from dongjoon-hyun/SPARK-SESSION. --- .../spark/sql/SparkSessionBuilderSuite.scala | 46 ++++++++----------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index 770e15629c839..c0301f2ce2d66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -17,50 +17,49 @@ package org.apache.spark.sql +import org.scalatest.BeforeAndAfterEach + import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.sql.internal.SQLConf /** * Test cases for the builder pattern of [[SparkSession]]. */ -class SparkSessionBuilderSuite extends SparkFunSuite { +class SparkSessionBuilderSuite extends SparkFunSuite with BeforeAndAfterEach { - private var initialSession: SparkSession = _ + override def afterEach(): Unit = { + // This suite should not interfere with the other test suites. 
+ SparkSession.getActiveSession.foreach(_.stop()) + SparkSession.clearActiveSession() + SparkSession.getDefaultSession.foreach(_.stop()) + SparkSession.clearDefaultSession() + } - private lazy val sparkContext: SparkContext = { - initialSession = SparkSession.builder() + test("create with config options and propagate them to SparkContext and SparkSession") { + val session = SparkSession.builder() .master("local") .config("spark.ui.enabled", value = false) .config("some-config", "v2") .getOrCreate() - initialSession.sparkContext - } - - test("create with config options and propagate them to SparkContext and SparkSession") { - // Creating a new session with config - this works by just calling the lazy val - sparkContext - assert(initialSession.sparkContext.conf.get("some-config") == "v2") - assert(initialSession.conf.get("some-config") == "v2") - SparkSession.clearDefaultSession() + assert(session.sparkContext.conf.get("some-config") == "v2") + assert(session.conf.get("some-config") == "v2") } test("use global default session") { - val session = SparkSession.builder().getOrCreate() + val session = SparkSession.builder().master("local").getOrCreate() assert(SparkSession.builder().getOrCreate() == session) - SparkSession.clearDefaultSession() } test("config options are propagated to existing SparkSession") { - val session1 = SparkSession.builder().config("spark-config1", "a").getOrCreate() + val session1 = SparkSession.builder().master("local").config("spark-config1", "a").getOrCreate() assert(session1.conf.get("spark-config1") == "a") val session2 = SparkSession.builder().config("spark-config1", "b").getOrCreate() assert(session1 == session2) assert(session1.conf.get("spark-config1") == "b") - SparkSession.clearDefaultSession() } test("use session from active thread session and propagate config options") { - val defaultSession = SparkSession.builder().getOrCreate() + val defaultSession = SparkSession.builder().master("local").getOrCreate() val activeSession = defaultSession.newSession() SparkSession.setActiveSession(activeSession) val session = SparkSession.builder().config("spark-config2", "a").getOrCreate() @@ -73,16 +72,14 @@ class SparkSessionBuilderSuite extends SparkFunSuite { SparkSession.clearActiveSession() assert(SparkSession.builder().getOrCreate() == defaultSession) - SparkSession.clearDefaultSession() } test("create a new session if the default session has been stopped") { - val defaultSession = SparkSession.builder().getOrCreate() + val defaultSession = SparkSession.builder().master("local").getOrCreate() SparkSession.setDefaultSession(defaultSession) defaultSession.stop() val newSession = SparkSession.builder().master("local").getOrCreate() assert(newSession != defaultSession) - newSession.stop() } test("create a new session if the active thread session has been stopped") { @@ -91,11 +88,9 @@ class SparkSessionBuilderSuite extends SparkFunSuite { activeSession.stop() val newSession = SparkSession.builder().master("local").getOrCreate() assert(newSession != activeSession) - newSession.stop() } test("create SparkContext first then SparkSession") { - sparkContext.stop() val conf = new SparkConf().setAppName("test").setMaster("local").set("key1", "value1") val sparkContext2 = new SparkContext(conf) val session = SparkSession.builder().config("key2", "value2").getOrCreate() @@ -105,11 +100,9 @@ class SparkSessionBuilderSuite extends SparkFunSuite { // We won't update conf for existing `SparkContext` assert(!sparkContext2.conf.contains("key2")) assert(sparkContext2.conf.get("key1") 
== "value1") - session.stop() } test("create SparkContext first then pass context to SparkSession") { - sparkContext.stop() val conf = new SparkConf().setAppName("test").setMaster("local").set("key1", "value1") val newSC = new SparkContext(conf) val session = SparkSession.builder().sparkContext(newSC).config("key2", "value2").getOrCreate() @@ -121,14 +114,12 @@ class SparkSessionBuilderSuite extends SparkFunSuite { // the conf of this sparkContext will not contain the conf set through the API config. assert(!session.sparkContext.conf.contains("key2")) assert(session.sparkContext.conf.get("spark.app.name") == "test") - session.stop() } test("SPARK-15887: hive-site.xml should be loaded") { val session = SparkSession.builder().master("local").getOrCreate() assert(session.sessionState.newHadoopConf().get("hive.in.test") == "true") assert(session.sparkContext.hadoopConfiguration.get("hive.in.test") == "true") - session.stop() } test("SPARK-15991: Set global Hadoop conf") { @@ -140,7 +131,6 @@ class SparkSessionBuilderSuite extends SparkFunSuite { assert(session.sessionState.newHadoopConf().get(mySpecialKey) == mySpecialValue) } finally { session.sparkContext.hadoopConfiguration.unset(mySpecialKey) - session.stop() } } } From 9fccc3627fa41d32fbae6dbbb9bd1521e43eb4f0 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Sat, 8 Jul 2017 20:44:12 +0800 Subject: [PATCH 133/779] [SPARK-21083][SQL] Store zero size and row count when analyzing empty table ## What changes were proposed in this pull request? We should be able to store zero size and row count after analyzing empty table. This pr also enhances the test cases for re-analyzing tables. ## How was this patch tested? Added a new test case and enhanced some test cases. Author: Zhenhua Wang Closes #18292 from wzhfy/analyzeNewColumn. 
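To make the behavior change concrete, here is a small, hypothetical Scala sketch (e.g. pasted into spark-shell); the session setup and table name are made up for illustration, and the expected stored values simply mirror the new test added below:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local").appName("empty-table-stats").getOrCreate()

    spark.sql("CREATE TABLE empty_tbl (key STRING, value STRING) USING PARQUET")

    // Previously a computed total size of 0 was discarded (only sizes > 0 were stored), so an
    // empty table ended up with no statistics at all. With this change:
    spark.sql("ANALYZE TABLE empty_tbl COMPUTE STATISTICS NOSCAN")  // stores sizeInBytes = 0
    spark.sql("ANALYZE TABLE empty_tbl COMPUTE STATISTICS")         // additionally stores rowCount = 0

    // One way to eyeball what was recorded; the statistics appear in the detailed table
    // information when they are present (output format varies by Spark version).
    spark.sql("DESCRIBE EXTENDED empty_tbl").show(100, truncate = false)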
--- .../command/AnalyzeTableCommand.scala | 5 +- .../spark/sql/StatisticsCollectionSuite.scala | 13 +++++ .../spark/sql/hive/StatisticsSuite.scala | 52 ++++++++++++++++--- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala index 42e2a9ca5c4e2..cba147c35dd99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTableType} -import org.apache.spark.sql.execution.SQLExecution /** @@ -40,10 +39,10 @@ case class AnalyzeTableCommand( } val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta) - val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(0L) + val oldTotalSize = tableMeta.stats.map(_.sizeInBytes.toLong).getOrElse(-1L) val oldRowCount = tableMeta.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L) var newStats: Option[CatalogStatistics] = None - if (newTotalSize > 0 && newTotalSize != oldTotalSize) { + if (newTotalSize >= 0 && newTotalSize != oldTotalSize) { newStats = Some(CatalogStatistics(sizeInBytes = newTotalSize)) } // We only set rowCount when noscan is false, because otherwise: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 843ced7f0e697..b80bd80e93e8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -82,6 +82,19 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } + test("analyze empty table") { + val table = "emptyTable" + withTable(table) { + sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan") + val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) + assert(fetchedStats1.get.sizeInBytes == 0) + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS") + val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetchedStats2.get.sizeInBytes == 0) + } + } + test("analyze column command - unsupported types and invalid columns") { val tableName = "column_stats_test1" withTable(tableName) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index e00fa64e9f2ce..84bcea30d61a6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -26,6 +26,7 @@ import scala.util.matching.Regex import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics} +import org.apache.spark.sql.catalyst.plans.logical.ColumnStat import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.command.DDLUtils import 
org.apache.spark.sql.execution.datasources.LogicalRelation @@ -210,27 +211,62 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } - test("test elimination of the influences of the old stats") { + test("keep existing row count in stats with noscan if table is not changed") { val textTable = "textTable" withTable(textTable) { - sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE") + sql(s"CREATE TABLE $textTable (key STRING, value STRING)") sql(s"INSERT INTO TABLE $textTable SELECT * FROM src") sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS") val fetchedStats1 = checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = Some(500)) sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan") - // when the total size is not changed, the old row count is kept + // when the table is not changed, total size is the same, and the old row count is kept val fetchedStats2 = checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = Some(500)) assert(fetchedStats1 == fetchedStats2) + } + } - sql(s"INSERT INTO TABLE $textTable SELECT * FROM src") - sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan") - // update total size and remove the old and invalid row count + test("keep existing column stats if table is not changed") { + val table = "update_col_stats_table" + withTable(table) { + sql(s"CREATE TABLE $table (c1 INT, c2 STRING, c3 DOUBLE)") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1") + val fetchedStats0 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetchedStats0.get.colStats == Map("c1" -> ColumnStat(0, None, None, 0, 4, 4))) + + // Insert new data and analyze: have the latest column stats. + sql(s"INSERT INTO TABLE $table SELECT 1, 'a', 10.0") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1") + val fetchedStats1 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(1)).get + assert(fetchedStats1.colStats == Map( + "c1" -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(1), nullCount = 0, + avgLen = 4, maxLen = 4))) + + // Analyze another column: since the table is not changed, the precious column stats are kept. + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c2") + val fetchedStats2 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(1)).get + assert(fetchedStats2.colStats == Map( + "c1" -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(1), nullCount = 0, + avgLen = 4, maxLen = 4), + "c2" -> ColumnStat(distinctCount = 1, min = None, max = None, nullCount = 0, + avgLen = 1, maxLen = 1))) + + // Insert new data and analyze: stale column stats are removed and newly collected column + // stats are added. 
+ sql(s"INSERT INTO TABLE $table SELECT 2, 'b', 20.0") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1, c3") val fetchedStats3 = - checkTableStats(textTable, hasSizeInBytes = true, expectedRowCounts = None) - assert(fetchedStats3.get.sizeInBytes > fetchedStats2.get.sizeInBytes) + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)).get + assert(fetchedStats3.colStats == Map( + "c1" -> ColumnStat(distinctCount = 2, min = Some(1), max = Some(2), nullCount = 0, + avgLen = 4, maxLen = 4), + "c3" -> ColumnStat(distinctCount = 2, min = Some(10.0), max = Some(20.0), nullCount = 0, + avgLen = 8, maxLen = 8))) } } From 9131bdb7e12bcfb2cb699b3438f554604e28aaa8 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Sun, 9 Jul 2017 00:24:54 +0800 Subject: [PATCH 134/779] [SPARK-20342][CORE] Update task accumulators before sending task end event. This makes sures that listeners get updated task information; otherwise it's possible to write incomplete task information into event logs, for example, making the information in a replayed UI inconsistent with the original application. Added a new unit test to try to detect the problem, but it's not guaranteed to fail since it's a race; but it fails pretty reliably for me without the scheduler changes. Author: Marcelo Vanzin Closes #18393 from vanzin/SPARK-20342.try2. --- .../apache/spark/scheduler/DAGScheduler.scala | 70 ++++++++++++------- .../spark/scheduler/DAGSchedulerSuite.scala | 32 ++++++++- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 3422a5f204b12..89b4cab88109d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1122,6 +1122,25 @@ class DAGScheduler( } } + private def postTaskEnd(event: CompletionEvent): Unit = { + val taskMetrics: TaskMetrics = + if (event.accumUpdates.nonEmpty) { + try { + TaskMetrics.fromAccumulators(event.accumUpdates) + } catch { + case NonFatal(e) => + val taskId = event.taskInfo.taskId + logError(s"Error when attempting to reconstruct metrics for task $taskId", e) + null + } + } else { + null + } + + listenerBus.post(SparkListenerTaskEnd(event.task.stageId, event.task.stageAttemptId, + Utils.getFormattedClassName(event.task), event.reason, event.taskInfo, taskMetrics)) + } + /** * Responds to a task finishing. This is called inside the event loop so it assumes that it can * modify the scheduler's internal state. Use taskEnded() to post a task end event from outside. @@ -1138,34 +1157,36 @@ class DAGScheduler( event.taskInfo.attemptNumber, // this is a task attempt number event.reason) - // Reconstruct task metrics. Note: this may be null if the task has failed. - val taskMetrics: TaskMetrics = - if (event.accumUpdates.nonEmpty) { - try { - TaskMetrics.fromAccumulators(event.accumUpdates) - } catch { - case NonFatal(e) => - logError(s"Error when attempting to reconstruct metrics for task $taskId", e) - null - } - } else { - null - } - - // The stage may have already finished when we get this event -- eg. maybe it was a - // speculative task. It is important that we send the TaskEnd event in any case, so listeners - // are properly notified and can chose to handle it. For instance, some listeners are - // doing their own accounting and if they don't get the task end event they think - // tasks are still running when they really aren't. 
- listenerBus.post(SparkListenerTaskEnd( - stageId, task.stageAttemptId, taskType, event.reason, event.taskInfo, taskMetrics)) - if (!stageIdToStage.contains(task.stageId)) { + // The stage may have already finished when we get this event -- eg. maybe it was a + // speculative task. It is important that we send the TaskEnd event in any case, so listeners + // are properly notified and can chose to handle it. For instance, some listeners are + // doing their own accounting and if they don't get the task end event they think + // tasks are still running when they really aren't. + postTaskEnd(event) + // Skip all the actions if the stage has been cancelled. return } val stage = stageIdToStage(task.stageId) + + // Make sure the task's accumulators are updated before any other processing happens, so that + // we can post a task end event before any jobs or stages are updated. The accumulators are + // only updated in certain cases. + event.reason match { + case Success => + stage match { + case rs: ResultStage if rs.activeJob.isEmpty => + // Ignore update if task's job has finished. + case _ => + updateAccumulators(event) + } + case _: ExceptionFailure => updateAccumulators(event) + case _ => + } + postTaskEnd(event) + event.reason match { case Success => task match { @@ -1176,7 +1197,6 @@ class DAGScheduler( resultStage.activeJob match { case Some(job) => if (!job.finished(rt.outputId)) { - updateAccumulators(event) job.finished(rt.outputId) = true job.numFinished += 1 // If the whole job has finished, remove it @@ -1203,7 +1223,6 @@ class DAGScheduler( case smt: ShuffleMapTask => val shuffleStage = stage.asInstanceOf[ShuffleMapStage] - updateAccumulators(event) val status = event.result.asInstanceOf[MapStatus] val execId = status.location.executorId logDebug("ShuffleMapTask finished on " + execId) @@ -1374,8 +1393,7 @@ class DAGScheduler( // Do nothing here, left up to the TaskScheduler to decide how to handle denied commits case exceptionFailure: ExceptionFailure => - // Tasks failed with exceptions might still have accumulator updates. - updateAccumulators(event) + // Nothing left to do, already handled above for accumulator updates. case TaskResultLost => // Do nothing here; the TaskScheduler handles these failures and resubmits the task. diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 453be26ed8d0c..3b5df657d45cf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.scheduler import java.util.Properties -import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} import scala.annotation.meta.param import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map} @@ -2346,6 +2346,36 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou (Success, 1))) } + test("task end event should have updated accumulators (SPARK-20342)") { + val tasks = 10 + + val accumId = new AtomicLong() + val foundCount = new AtomicLong() + val listener = new SparkListener() { + override def onTaskEnd(event: SparkListenerTaskEnd): Unit = { + event.taskInfo.accumulables.find(_.id == accumId.get).foreach { _ => + foundCount.incrementAndGet() + } + } + } + sc.addSparkListener(listener) + + // Try a few times in a loop to make sure. 
This is not guaranteed to fail when the bug exists, + // but it should at least make the test flaky. If the bug is fixed, this should always pass. + (1 to 10).foreach { i => + foundCount.set(0L) + + val accum = sc.longAccumulator(s"accum$i") + accumId.set(accum.id) + + sc.parallelize(1 to tasks, tasks).foreach { _ => + accum.add(1L) + } + sc.listenerBus.waitUntilEmpty(1000) + assert(foundCount.get() === tasks) + } + } + /** * Assert that the supplied TaskSet has exactly the given hosts as its preferred locations. * Note that this checks only the host and not the executor ID. From 062c336d06a0bd4e740a18d2349e03e311509243 Mon Sep 17 00:00:00 2001 From: jinxing Date: Sun, 9 Jul 2017 00:27:58 +0800 Subject: [PATCH 135/779] [SPARK-21343] Refine the document for spark.reducer.maxReqSizeShuffleToMem. ## What changes were proposed in this pull request? In current code, reducer can break the old shuffle service when `spark.reducer.maxReqSizeShuffleToMem` is enabled. Let's refine document. Author: jinxing Closes #18566 from jinxing64/SPARK-21343. --- .../org/apache/spark/internal/config/package.scala | 6 ++++-- docs/configuration.md | 10 ++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index a629810bf093a..512d539ee9c38 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -323,9 +323,11 @@ package object config { private[spark] val REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM = ConfigBuilder("spark.reducer.maxReqSizeShuffleToMem") - .internal() .doc("The blocks of a shuffle request will be fetched to disk when size of the request is " + - "above this threshold. This is to avoid a giant request takes too much memory.") + "above this threshold. This is to avoid a giant request takes too much memory. We can " + + "enable this config by setting a specific value(e.g. 200m). Note that this config can " + + "be enabled only when the shuffle shuffle service is newer than Spark-2.2 or the shuffle" + + " service is disabled.") .bytesConf(ByteUnit.BYTE) .createWithDefault(Long.MaxValue) diff --git a/docs/configuration.md b/docs/configuration.md index 7dc23e441a7ba..6ca84240c1247 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -528,6 +528,16 @@ Apart from these, the following properties are also available, and may be useful By allowing it to limit the number of fetch requests, this scenario can be mitigated. + + spark.reducer.maxReqSizeShuffleToMem + Long.MaxValue + + The blocks of a shuffle request will be fetched to disk when size of the request is above + this threshold. This is to avoid a giant request takes too much memory. We can enable this + config by setting a specific value(e.g. 200m). Note that this config can be enabled only when + the shuffle shuffle service is newer than Spark-2.2 or the shuffle service is disabled. + + spark.shuffle.compress true From c3712b77a915a5cb12b6c0204bc5bd6037aad1f5 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Sat, 8 Jul 2017 11:56:19 -0700 Subject: [PATCH 136/779] [SPARK-21307][REVERT][SQL] Remove SQLConf parameters from the parser-related classes ## What changes were proposed in this pull request? Since we do not set active sessions when parsing the plan, we are unable to correctly use SQLConf.get to find the correct active session. 
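To make the motivation concrete, the sketch below contrasts the two ways a parser can obtain its `SQLConf`; it is illustrative only (the class names are made up, not code from this patch), and only `SQLConf.get` and `escapedStringLiterals` are taken from the diffs below.

```scala
import org.apache.spark.sql.internal.SQLConf

// Style this revert restores: the conf is injected when the parser is constructed,
// so parsing always sees the owning session's settings.
class ParserWithInjectedConf(conf: SQLConf) {
  def useEscapedLiterals: Boolean = conf.escapedStringLiterals
}

// Style being reverted: the conf is looked up lazily through SQLConf.get, which relies
// on an active session being registered for the current thread; if parsing happens
// without one, it can silently fall back to default values.
class ParserWithLazyConf {
  def useEscapedLiterals: Boolean = SQLConf.get.escapedStringLiterals
}
```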
Since https://github.com/apache/spark/pull/18531 breaks the build, I plan to revert it at first. ## How was this patch tested? The existing test cases Author: Xiao Li Closes #18568 from gatorsmile/revert18531. --- .../sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 6 +- .../sql/catalyst/parser/ParseDriver.scala | 8 +- .../parser/ExpressionParserSuite.scala | 167 +++++++++--------- .../spark/sql/execution/SparkSqlParser.scala | 11 +- .../org/apache/spark/sql/functions.scala | 3 +- .../internal/BaseSessionStateBuilder.scala | 2 +- .../sql/internal/VariableSubstitution.scala | 4 +- .../sql/execution/SparkSqlParserSuite.scala | 10 +- .../execution/command/DDLCommandSuite.scala | 4 +- .../internal/VariableSubstitutionSuite.scala | 31 ++-- 11 files changed, 127 insertions(+), 121 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 336d3d65d0dd0..c40d5f6031a21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -74,7 +74,7 @@ class SessionCatalog( functionRegistry, conf, new Configuration(), - CatalystSqlParser, + new CatalystSqlParser(conf), DummyFunctionResourceLoader) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 4d725904bc9b9..a616b0f773f38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -45,9 +45,11 @@ import org.apache.spark.util.random.RandomSampler * The AstBuilder converts an ANTLR4 ParseTree into a catalyst Expression, LogicalPlan or * TableIdentifier. */ -class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { +class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging { import ParserUtils._ + def this() = this(new SQLConf()) + protected def typedVisit[T](ctx: ParseTree): T = { ctx.accept(this).asInstanceOf[T] } @@ -1457,7 +1459,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { * Special characters can be escaped by using Hive/C-style escaping. 
*/ private def createString(ctx: StringLiteralContext): String = { - if (SQLConf.get.escapedStringLiterals) { + if (conf.escapedStringLiterals) { ctx.STRING().asScala.map(stringWithoutUnescape).mkString } else { ctx.STRING().asScala.map(string).mkString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index 7e1fcfefc64a5..09598ffe770c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.Origin +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} /** @@ -121,8 +122,13 @@ abstract class AbstractSqlParser extends ParserInterface with Logging { /** * Concrete SQL parser for Catalyst-only SQL statements. */ +class CatalystSqlParser(conf: SQLConf) extends AbstractSqlParser { + val astBuilder = new AstBuilder(conf) +} + +/** For test-only. */ object CatalystSqlParser extends AbstractSqlParser { - val astBuilder = new AstBuilder + val astBuilder = new AstBuilder(new SQLConf()) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index ac7325257a15a..45f9f72dccc45 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -167,12 +167,12 @@ class ExpressionParserSuite extends PlanTest { } test("like expressions with ESCAPED_STRING_LITERALS = true") { - val parser = CatalystSqlParser - withSQLConf(SQLConf.ESCAPED_STRING_LITERALS.key -> "true") { - assertEqual("a rlike '^\\x20[\\x20-\\x23]+$'", 'a rlike "^\\x20[\\x20-\\x23]+$", parser) - assertEqual("a rlike 'pattern\\\\'", 'a rlike "pattern\\\\", parser) - assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser) - } + val conf = new SQLConf() + conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, "true") + val parser = new CatalystSqlParser(conf) + assertEqual("a rlike '^\\x20[\\x20-\\x23]+$'", 'a rlike "^\\x20[\\x20-\\x23]+$", parser) + assertEqual("a rlike 'pattern\\\\'", 'a rlike "pattern\\\\", parser) + assertEqual("a rlike 'pattern\\t\\n'", 'a rlike "pattern\\t\\n", parser) } test("is null expressions") { @@ -435,85 +435,86 @@ class ExpressionParserSuite extends PlanTest { } test("strings") { - val parser = CatalystSqlParser Seq(true, false).foreach { escape => - withSQLConf(SQLConf.ESCAPED_STRING_LITERALS.key -> escape.toString) { - // tests that have same result whatever the conf is - // Single Strings. - assertEqual("\"hello\"", "hello", parser) - assertEqual("'hello'", "hello", parser) - - // Multi-Strings. - assertEqual("\"hello\" 'world'", "helloworld", parser) - assertEqual("'hello' \" \" 'world'", "hello world", parser) - - // 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a - // regular '%'; to get the correct result you need to add another escaped '\'. 
- // TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? - assertEqual("'pattern%'", "pattern%", parser) - assertEqual("'no-pattern\\%'", "no-pattern\\%", parser) - - // tests that have different result regarding the conf - if (escape) { - // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to - // Spark 1.6 behavior. - - // 'LIKE' string literals. - assertEqual("'pattern\\\\%'", "pattern\\\\%", parser) - assertEqual("'pattern\\\\\\%'", "pattern\\\\\\%", parser) - - // Escaped characters. - // Unescape string literal "'\\0'" for ASCII NUL (X'00') doesn't work - // when ESCAPED_STRING_LITERALS is enabled. - // It is parsed literally. - assertEqual("'\\0'", "\\0", parser) - - // Note: Single quote follows 1.6 parsing behavior when ESCAPED_STRING_LITERALS is - // enabled. - val e = intercept[ParseException](parser.parseExpression("'\''")) - assert(e.message.contains("extraneous input '''")) - - // The unescape special characters (e.g., "\\t") for 2.0+ don't work - // when ESCAPED_STRING_LITERALS is enabled. They are parsed literally. - assertEqual("'\\\"'", "\\\"", parser) // Double quote - assertEqual("'\\b'", "\\b", parser) // Backspace - assertEqual("'\\n'", "\\n", parser) // Newline - assertEqual("'\\r'", "\\r", parser) // Carriage return - assertEqual("'\\t'", "\\t", parser) // Tab character - - // The unescape Octals for 2.0+ don't work when ESCAPED_STRING_LITERALS is enabled. - // They are parsed literally. - assertEqual("'\\110\\145\\154\\154\\157\\041'", "\\110\\145\\154\\154\\157\\041", parser) - // The unescape Unicode for 2.0+ doesn't work when ESCAPED_STRING_LITERALS is enabled. - // They are parsed literally. - assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", - "\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029", parser) - } else { - // Default behavior - - // 'LIKE' string literals. - assertEqual("'pattern\\\\%'", "pattern\\%", parser) - assertEqual("'pattern\\\\\\%'", "pattern\\\\%", parser) - - // Escaped characters. - // See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html - assertEqual("'\\0'", "\u0000", parser) // ASCII NUL (X'00') - assertEqual("'\\''", "\'", parser) // Single quote - assertEqual("'\\\"'", "\"", parser) // Double quote - assertEqual("'\\b'", "\b", parser) // Backspace - assertEqual("'\\n'", "\n", parser) // Newline - assertEqual("'\\r'", "\r", parser) // Carriage return - assertEqual("'\\t'", "\t", parser) // Tab character - assertEqual("'\\Z'", "\u001A", parser) // ASCII 26 - CTRL + Z (EOF on windows) - - // Octals - assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!", parser) - - // Unicode - assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)", - parser) - } + val conf = new SQLConf() + conf.setConfString(SQLConf.ESCAPED_STRING_LITERALS.key, escape.toString) + val parser = new CatalystSqlParser(conf) + + // tests that have same result whatever the conf is + // Single Strings. + assertEqual("\"hello\"", "hello", parser) + assertEqual("'hello'", "hello", parser) + + // Multi-Strings. + assertEqual("\"hello\" 'world'", "helloworld", parser) + assertEqual("'hello' \" \" 'world'", "hello world", parser) + + // 'LIKE' string literals. Notice that an escaped '%' is the same as an escaped '\' and a + // regular '%'; to get the correct result you need to add another escaped '\'. + // TODO figure out if we shouldn't change the ParseUtils.unescapeSQLString method? 
+ assertEqual("'pattern%'", "pattern%", parser) + assertEqual("'no-pattern\\%'", "no-pattern\\%", parser) + + // tests that have different result regarding the conf + if (escape) { + // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to + // Spark 1.6 behavior. + + // 'LIKE' string literals. + assertEqual("'pattern\\\\%'", "pattern\\\\%", parser) + assertEqual("'pattern\\\\\\%'", "pattern\\\\\\%", parser) + + // Escaped characters. + // Unescape string literal "'\\0'" for ASCII NUL (X'00') doesn't work + // when ESCAPED_STRING_LITERALS is enabled. + // It is parsed literally. + assertEqual("'\\0'", "\\0", parser) + + // Note: Single quote follows 1.6 parsing behavior when ESCAPED_STRING_LITERALS is enabled. + val e = intercept[ParseException](parser.parseExpression("'\''")) + assert(e.message.contains("extraneous input '''")) + + // The unescape special characters (e.g., "\\t") for 2.0+ don't work + // when ESCAPED_STRING_LITERALS is enabled. They are parsed literally. + assertEqual("'\\\"'", "\\\"", parser) // Double quote + assertEqual("'\\b'", "\\b", parser) // Backspace + assertEqual("'\\n'", "\\n", parser) // Newline + assertEqual("'\\r'", "\\r", parser) // Carriage return + assertEqual("'\\t'", "\\t", parser) // Tab character + + // The unescape Octals for 2.0+ don't work when ESCAPED_STRING_LITERALS is enabled. + // They are parsed literally. + assertEqual("'\\110\\145\\154\\154\\157\\041'", "\\110\\145\\154\\154\\157\\041", parser) + // The unescape Unicode for 2.0+ doesn't work when ESCAPED_STRING_LITERALS is enabled. + // They are parsed literally. + assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", + "\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029", parser) + } else { + // Default behavior + + // 'LIKE' string literals. + assertEqual("'pattern\\\\%'", "pattern\\%", parser) + assertEqual("'pattern\\\\\\%'", "pattern\\\\%", parser) + + // Escaped characters. + // See: http://dev.mysql.com/doc/refman/5.7/en/string-literals.html + assertEqual("'\\0'", "\u0000", parser) // ASCII NUL (X'00') + assertEqual("'\\''", "\'", parser) // Single quote + assertEqual("'\\\"'", "\"", parser) // Double quote + assertEqual("'\\b'", "\b", parser) // Backspace + assertEqual("'\\n'", "\n", parser) // Newline + assertEqual("'\\r'", "\r", parser) // Carriage return + assertEqual("'\\t'", "\t", parser) // Tab character + assertEqual("'\\Z'", "\u001A", parser) // ASCII 26 - CTRL + Z (EOF on windows) + + // Octals + assertEqual("'\\110\\145\\154\\154\\157\\041'", "Hello!", parser) + + // Unicode + assertEqual("'\\u0057\\u006F\\u0072\\u006C\\u0064\\u0020\\u003A\\u0029'", "World :)", + parser) } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 618d027d8dc07..2f8e416e7df1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -39,11 +39,10 @@ import org.apache.spark.sql.types.StructType /** * Concrete parser for Spark SQL statements. 
*/ -class SparkSqlParser extends AbstractSqlParser { +class SparkSqlParser(conf: SQLConf) extends AbstractSqlParser { + val astBuilder = new SparkSqlAstBuilder(conf) - val astBuilder = new SparkSqlAstBuilder - - private val substitutor = new VariableSubstitution + private val substitutor = new VariableSubstitution(conf) protected override def parse[T](command: String)(toResult: SqlBaseParser => T): T = { super.parse(substitutor.substitute(command))(toResult) @@ -53,11 +52,9 @@ class SparkSqlParser extends AbstractSqlParser { /** * Builder that converts an ANTLR ParseTree into a LogicalPlan/Expression/TableIdentifier. */ -class SparkSqlAstBuilder extends AstBuilder { +class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { import org.apache.spark.sql.catalyst.parser.ParserUtils._ - private def conf: SQLConf = SQLConf.get - /** * Create a [[SetCommand]] logical plan. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index a5e4a444f33be..0c7b483f5c836 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, ResolvedHint} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -1275,7 +1276,7 @@ object functions { */ def expr(expr: String): Column = { val parser = SparkSession.getActiveSession.map(_.sessionState.sqlParser).getOrElse { - new SparkSqlParser + new SparkSqlParser(new SQLConf) } Column(parser.parseExpression(expr)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 72d0ddc62303a..267f76217df84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -114,7 +114,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on the `conf` field. */ protected lazy val sqlParser: ParserInterface = { - extensions.buildParser(session, new SparkSqlParser) + extensions.buildParser(session, new SparkSqlParser(conf)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 2b9c574aaaf0c..4e7c813be9922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -25,9 +25,7 @@ import org.apache.spark.internal.config._ * * Variable substitution is controlled by `SQLConf.variableSubstituteEnabled`. 
*/ -class VariableSubstitution { - - private def conf = SQLConf.get +class VariableSubstitution(conf: SQLConf) { private val provider = new ConfigProvider { override def get(key: String): Option[String] = Option(conf.getConfString(key, "")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 2e29fa43f73d9..d238c76fbeeff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -37,7 +37,8 @@ import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType */ class SparkSqlParserSuite extends AnalysisTest { - private lazy val parser = new SparkSqlParser + val newConf = new SQLConf + private lazy val parser = new SparkSqlParser(newConf) /** * Normalizes plans: @@ -284,7 +285,6 @@ class SparkSqlParserSuite extends AnalysisTest { } test("query organization") { - val conf = SQLConf.get // Test all valid combinations of order by/sort by/distribute by/cluster by/limit/windows val baseSql = "select * from t" val basePlan = @@ -293,20 +293,20 @@ class SparkSqlParserSuite extends AnalysisTest { assertEqual(s"$baseSql distribute by a, b", RepartitionByExpression(UnresolvedAttribute("a") :: UnresolvedAttribute("b") :: Nil, basePlan, - numPartitions = conf.numShufflePartitions)) + numPartitions = newConf.numShufflePartitions)) assertEqual(s"$baseSql distribute by a sort by b", Sort(SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, global = false, RepartitionByExpression(UnresolvedAttribute("a") :: Nil, basePlan, - numPartitions = conf.numShufflePartitions))) + numPartitions = newConf.numShufflePartitions))) assertEqual(s"$baseSql cluster by a, b", Sort(SortOrder(UnresolvedAttribute("a"), Ascending) :: SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil, global = false, RepartitionByExpression(UnresolvedAttribute("a") :: UnresolvedAttribute("b") :: Nil, basePlan, - numPartitions = conf.numShufflePartitions))) + numPartitions = newConf.numShufflePartitions))) } test("pipeline concatenation") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 750574830381f..5643c58d9f847 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -29,13 +29,13 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} // TODO: merge this with DDLSuite (SPARK-14441) class DDLCommandSuite extends PlanTest { - private lazy val parser = new SparkSqlParser + private lazy val parser = new SparkSqlParser(new SQLConf) private def assertUnsupported(sql: String, containsThesePhrases: Seq[String] = Seq()): Unit = { val e = intercept[ParseException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala 
index c5e5b70e21335..d5a946aeaac31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/VariableSubstitutionSuite.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.AnalysisException -class VariableSubstitutionSuite extends SparkFunSuite with PlanTest { +class VariableSubstitutionSuite extends SparkFunSuite { - private lazy val sub = new VariableSubstitution + private lazy val conf = new SQLConf + private lazy val sub = new VariableSubstitution(conf) test("system property") { System.setProperty("varSubSuite.var", "abcd") @@ -34,26 +35,26 @@ class VariableSubstitutionSuite extends SparkFunSuite with PlanTest { } test("Spark configuration variable") { - withSQLConf("some-random-string-abcd" -> "1234abcd") { - assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") - assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") - } + conf.setConfString("some-random-string-abcd", "1234abcd") + assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") + assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") } test("multiple substitutes") { val q = "select ${bar} ${foo} ${doo} this is great" - withSQLConf("bar" -> "1", "foo" -> "2", "doo" -> "3") { - assert(sub.substitute(q) == "select 1 2 3 this is great") - } + conf.setConfString("bar", "1") + conf.setConfString("foo", "2") + conf.setConfString("doo", "3") + assert(sub.substitute(q) == "select 1 2 3 this is great") } test("test nested substitutes") { val q = "select ${bar} ${foo} this is great" - withSQLConf("bar" -> "1", "foo" -> "${bar}") { - assert(sub.substitute(q) == "select 1 1 this is great") - } + conf.setConfString("bar", "1") + conf.setConfString("foo", "${bar}") + assert(sub.substitute(q) == "select 1 1 this is great") } } From 08e0d033b40946b4ef5741a7aa1e7ba0bd48c6fb Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 8 Jul 2017 14:24:37 -0700 Subject: [PATCH 137/779] [SPARK-21093][R] Terminate R's worker processes in the parent of R's daemon to prevent a leak ## What changes were proposed in this pull request? This is a retry for #18320. This PR was reverted due to unexpected test failures with -10 error code. I was unable to reproduce in MacOS, CentOS and Ubuntu but only in Jenkins. So, the tests proceeded to verify this and revert the past try here - https://github.com/apache/spark/pull/18456 This new approach was tested in https://github.com/apache/spark/pull/18463. **Test results**: - With the part of suspicious change in the past try (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189) Tests ran 4 times and 2 times passed and 2 time failed. - Without the part of suspicious change in the past try (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189) Tests ran 5 times and they all passed. - With this new approach (https://github.com/apache/spark/pull/18463/commits/0a7589c09f53dfc2094497d8d3e59d6407569417) Tests ran 5 times and they all passed. 
It looks the cause is as below (see https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189): ```diff + exitCode <- 1 ... + data <- parallel:::readChild(child) + if (is.raw(data)) { + if (unserialize(data) == exitCode) { ... + } + } ... - parallel:::mcexit(0L) + parallel:::mcexit(0L, send = exitCode) ``` Two possibilities I think - `parallel:::mcexit(.. , send = exitCode)` https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html > It sends send to the master (unless NULL) and then shuts down the child process. However, it looks possible that the parent attemps to terminate the child right after getting our custom exit code. So, the child gets terminated between "send" and "shuts down", failing to exit properly. - A bug between `parallel:::mcexit(..., send = ...)` and `parallel:::readChild`. **Proposal**: To resolve this, I simply decided to avoid both possibilities with this new approach here (https://github.com/apache/spark/pull/18465/commits/9ff89a7859cb9f427fc774f33c3521c7d962b723). To support this idea, I explained with some quotation of the documentation as below: https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/mcfork.html > `readChild` and `readChildren` return a raw vector with a "pid" attribute if data were available, an integer vector of length one with the process ID if a child terminated or `NULL` if the child no longer exists (no children at all for `readChildren`). `readChild` returns "an integer vector of length one with the process ID if a child terminated" so we can check if it is `integer` and the same selected "process ID". I believe this makes sure that the children are exited. In case that children happen to send any data manually to parent (which is why we introduced the suspicious part of the change (https://github.com/apache/spark/pull/18463/commits/466325d3fd353668583f3bde38ae490d9db0b189)), this should be raw bytes and will be discarded (and then will try to read the next and check if it is `integer` in the next loop). ## How was this patch tested? Manual tests and Jenkins tests. Author: hyukjinkwon Closes #18465 from HyukjinKwon/SPARK-21093-retry-1. --- R/pkg/inst/worker/daemon.R | 51 +++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index 3a318b71ea06d..2e31dc5f728cd 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -30,8 +30,50 @@ port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) inputCon <- socketConnection( port = port, open = "rb", blocking = TRUE, timeout = connectionTimeout) +# Waits indefinitely for a socket connecion by default. +selectTimeout <- NULL + while (TRUE) { - ready <- socketSelect(list(inputCon)) + ready <- socketSelect(list(inputCon), timeout = selectTimeout) + + # Note that the children should be terminated in the parent. If each child terminates + # itself, it appears that the resource is not released properly, that causes an unexpected + # termination of this daemon due to, for example, running out of file descriptors + # (see SPARK-21093). Therefore, the current implementation tries to retrieve children + # that are exited (but not terminated) and then sends a kill signal to terminate them properly + # in the parent. + # + # There are two paths that it attempts to send a signal to terminate the children in the parent. + # + # 1. Every second if any socket connection is not available and if there are child workers + # running. + # 2. 
Right after a socket connection is available. + # + # In other words, the parent attempts to send the signal to the children every second if + # any worker is running or right before launching other worker children from the following + # new socket connection. + + # The process IDs of exited children are returned below. + children <- parallel:::selectChildren(timeout = 0) + + if (is.integer(children)) { + lapply(children, function(child) { + # This should be the PIDs of exited children. Otherwise, this returns raw bytes if any data + # was sent from this child. In this case, we discard it. + pid <- parallel:::readChild(child) + if (is.integer(pid)) { + # This checks if the data from this child is the same pid of this selected child. + if (child == pid) { + # If so, we terminate this child. + tools::pskill(child, tools::SIGUSR1) + } + } + }) + } else if (is.null(children)) { + # If it is NULL, there are no children. Waits indefinitely for a socket connecion. + selectTimeout <- NULL + } + if (ready) { port <- SparkR:::readInt(inputCon) # There is a small chance that it could be interrupted by signal, retry one time @@ -44,12 +86,15 @@ while (TRUE) { } p <- parallel:::mcfork() if (inherits(p, "masterProcess")) { + # Reach here because this is a child process. close(inputCon) Sys.setenv(SPARKR_WORKER_PORT = port) try(source(script)) - # Set SIGUSR1 so that child can exit - tools::pskill(Sys.getpid(), tools::SIGUSR1) + # Note that this mcexit does not fully terminate this child. parallel:::mcexit(0L) + } else { + # Forking succeeded and we need to check if they finished their jobs every second. + selectTimeout <- 1 } } } From 680b33f16694b7c460235b11b8c265bc304f795a Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sun, 9 Jul 2017 16:30:35 -0700 Subject: [PATCH 138/779] [SPARK-18016][SQL][FOLLOWUP] merge declareAddedFunctions, initNestedClasses and declareNestedClasses ## What changes were proposed in this pull request? These 3 methods have to be used together, so it makes more sense to merge them into one method and then the caller side only need to call one method. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #18579 from cloud-fan/minor. --- .../expressions/codegen/CodeGenerator.scala | 29 +++++++------------ .../codegen/GenerateMutableProjection.scala | 5 +--- .../codegen/GenerateOrdering.scala | 5 +--- .../codegen/GeneratePredicate.scala | 5 +--- .../codegen/GenerateSafeProjection.scala | 5 +--- .../codegen/GenerateUnsafeProjection.scala | 5 +--- .../sql/execution/WholeStageCodegenExec.scala | 5 +--- .../columnar/GenerateColumnAccessor.scala | 5 +--- 8 files changed, 18 insertions(+), 46 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index b15bf2ca7c116..7cf9daf628608 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -302,29 +302,20 @@ class CodegenContext { } /** - * Instantiates all nested, private sub-classes as objects to the `OuterClass` + * Declares all function code. If the added functions are too many, split them into nested + * sub-classes to avoid hitting Java compiler constant pool limitation. 
*/ - private[sql] def initNestedClasses(): String = { + def declareAddedFunctions(): String = { + val inlinedFunctions = classFunctions(outerClassName).values + // Nested, private sub-classes have no mutable state (though they do reference the outer class' // mutable state), so we declare and initialize them inline to the OuterClass. - classes.filter(_._1 != outerClassName).map { + val initNestedClasses = classes.filter(_._1 != outerClassName).map { case (className, classInstance) => s"private $className $classInstance = new $className();" - }.mkString("\n") - } - - /** - * Declares all function code that should be inlined to the `OuterClass`. - */ - private[sql] def declareAddedFunctions(): String = { - classFunctions(outerClassName).values.mkString("\n") - } + } - /** - * Declares all nested, private sub-classes and the function code that should be inlined to them. - */ - private[sql] def declareNestedClasses(): String = { - classFunctions.filterKeys(_ != outerClassName).map { + val declareNestedClasses = classFunctions.filterKeys(_ != outerClassName).map { case (className, functions) => s""" |private class $className { @@ -332,7 +323,9 @@ class CodegenContext { |} """.stripMargin } - }.mkString("\n") + + (inlinedFunctions ++ initNestedClasses ++ declareNestedClasses).mkString("\n") + } final val JAVA_BOOLEAN = "boolean" final val JAVA_BYTE = "byte" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 635766835029b..3768dcde00a4e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -115,8 +115,6 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP ${ctx.initPartition()} } - ${ctx.declareAddedFunctions()} - public ${classOf[BaseMutableProjection].getName} target(InternalRow row) { mutableRow = row; return this; @@ -136,8 +134,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP return mutableRow; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} } """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala index a31943255b995..4e47895985209 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala @@ -173,15 +173,12 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR ${ctx.initMutableStates()} } - ${ctx.declareAddedFunctions()} - public int compare(InternalRow a, InternalRow b) { $comparisons return 0; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} }""" val code = CodeFormatter.stripOverlappingComments( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala index b400783bb5e55..e35b9dda6c017 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala @@ -66,15 +66,12 @@ object GeneratePredicate extends CodeGenerator[Expression, Predicate] { ${ctx.initPartition()} } - ${ctx.declareAddedFunctions()} - public boolean eval(InternalRow ${ctx.INPUT_ROW}) { ${eval.code} return !${eval.isNull} && ${eval.value}; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} }""" val code = CodeFormatter.stripOverlappingComments( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index dd0419d2286d1..192701a829686 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -175,16 +175,13 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] ${ctx.initPartition()} } - ${ctx.declareAddedFunctions()} - public java.lang.Object apply(java.lang.Object _i) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) _i; $allExpressions return mutableRow; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} } """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 6be69d119bf8a..f2a66efc98e71 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -391,8 +391,6 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${ctx.initPartition()} } - ${ctx.declareAddedFunctions()} - // Scala.Function1 need this public java.lang.Object apply(java.lang.Object row) { return apply((InternalRow) row); @@ -403,8 +401,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro return ${eval.value}; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} } """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 0bd28e36135c8..1007a7d55691b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -352,14 +352,11 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co ${ctx.initPartition()} } - ${ctx.declareAddedFunctions()} - protected void processNext() throws java.io.IOException { ${code.trim} } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} } """.trim diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index fc977f2fd5530..da34643281911 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -192,8 +192,6 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera this.columnIndexes = columnIndexes; } - ${ctx.declareAddedFunctions()} - public boolean hasNext() { if (currentRow < numRowsInBatch) { return true; @@ -222,8 +220,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera return unsafeRow; } - ${ctx.initNestedClasses()} - ${ctx.declareNestedClasses()} + ${ctx.declareAddedFunctions()} }""" val code = CodeFormatter.stripOverlappingComments( From 457dc9ccbf8404fef6c1ebf8f82e59e4ba480a0e Mon Sep 17 00:00:00 2001 From: jerryshao Date: Mon, 10 Jul 2017 11:22:28 +0800 Subject: [PATCH 139/779] [MINOR][DOC] Improve the docs about how to correctly set configurations ## What changes were proposed in this pull request? Spark provides several ways to set configurations: from a configuration file, from `spark-submit` command line options, or programmatically through the `SparkConf` class. It may confuse beginners why some configurations set through `SparkConf` do not take effect. So this adds some docs to address the problem and let beginners know how to correctly set configurations. ## How was this patch tested? N/A Author: jerryshao Closes #18552 from jerryshao/improve-doc. --- docs/configuration.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 6ca84240c1247..91b5befd1b1eb 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -95,6 +95,13 @@ in the `spark-defaults.conf` file. A few configuration keys have been renamed since earlier versions of Spark; in such cases, the older key names are still accepted, but take lower precedence than any instance of the newer key. +Spark properties mainly can be divided into two kinds: one is related to deploy, like +"spark.driver.memory", "spark.executor.instances", this kind of properties may not be affected when +setting programmatically through `SparkConf` in runtime, or the behavior is depending on which +cluster manager and deploy mode you choose, so it would be suggested to set through configuration +file or `spark-submit` command line options; another is mainly related to Spark runtime control, +like "spark.task.maxFailures", this kind of properties can be set in either way. + ## Viewing Spark Properties The application web UI at `http://<driver>:4040` lists Spark properties in the "Environment" tab. From 0e80ecae300f3e2033419b2d98da8bf092c105bb Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Sun, 9 Jul 2017 22:53:27 -0700 Subject: [PATCH 140/779] [SPARK-21100][SQL][FOLLOWUP] cleanup code and add more comments for Dataset.summary ## What changes were proposed in this pull request? Some code cleanup and adding comments to make the code more readable. Changed the way result rows are generated, to be clearer. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #18570 from cloud-fan/summary.
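Since this follow-up touches the user-facing `Dataset.summary` API, a brief usage sketch may help; the DataFrame and column names here are illustrative and not taken from the patch, while the statistic names and the error message come from the diffs that follow.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("summary-example").getOrCreate()
import spark.implicits._

val df = Seq((1, "a", 10.0), (2, "b", 20.0)).toDF("c1", "c2", "c3")

// With no arguments, summary() computes the default statistics:
// count, mean, stddev, min, 25%, 50%, 75%, max.
df.summary().show()

// Statistics can also be selected explicitly, mixing aggregates and percentiles.
df.summary("count", "mean", "50%").show()

// An unrecognised statistic name fails with
// IllegalArgumentException("... is not a recognised statistic"), as asserted in DataFrameSuite.
// df.summary("foo")
```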
--- .../scala/org/apache/spark/sql/Dataset.scala | 9 -- .../sql/execution/stat/StatFunctions.scala | 129 ++++++++---------- .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- 3 files changed, 56 insertions(+), 84 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 5326b45b50a8b..dfb51192c69bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -224,15 +224,6 @@ class Dataset[T] private[sql]( } } - private[sql] def aggregatableColumns: Seq[Expression] = { - schema.fields - .filter(f => f.dataType.isInstanceOf[NumericType] || f.dataType.isInstanceOf[StringType]) - .map { n => - queryExecution.analyzed.resolveQuoted(n.name, sparkSession.sessionState.analyzer.resolver) - .get - } - } - /** * Compose the string representing rows for output * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 436e18fdb5ff5..a75cfb3600225 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -17,12 +17,15 @@ package org.apache.spark.sql.execution.stat +import java.util.Locale + import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} -import org.apache.spark.sql.catalyst.expressions.{Cast, CreateArray, Expression, GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, Expression, GenericInternalRow, GetArrayItem, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.catalyst.util.{usePrettyExpression, QuantileSummaries} +import org.apache.spark.sql.catalyst.util.QuantileSummaries import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -228,90 +231,68 @@ object StatFunctions extends Logging { val defaultStatistics = Seq("count", "mean", "stddev", "min", "25%", "50%", "75%", "max") val selectedStatistics = if (statistics.nonEmpty) statistics else defaultStatistics - val hasPercentiles = selectedStatistics.exists(_.endsWith("%")) - val (percentiles, percentileNames, remainingAggregates) = if (hasPercentiles) { - val (pStrings, rest) = selectedStatistics.partition(a => a.endsWith("%")) - val percentiles = pStrings.map { p => - try { - p.stripSuffix("%").toDouble / 100.0 - } catch { - case e: NumberFormatException => - throw new IllegalArgumentException(s"Unable to parse $p as a percentile", e) - } + val percentiles = selectedStatistics.filter(a => a.endsWith("%")).map { p => + try { + p.stripSuffix("%").toDouble / 100.0 + } catch { + case e: NumberFormatException => + throw new IllegalArgumentException(s"Unable to parse $p as a percentile", e) } - require(percentiles.forall(p => p >= 0 && p <= 1), "Percentiles must be in the range [0, 1]") - (percentiles, pStrings, rest) - } else { - (Seq(), Seq(), selectedStatistics) } + require(percentiles.forall(p => p >= 0 && p <= 1), "Percentiles must be in the range [0, 1]") - - // The list of summary statistics to compute, in the form of expressions. 
- val availableStatistics = Map[String, Expression => Expression]( - "count" -> ((child: Expression) => Count(child).toAggregateExpression()), - "mean" -> ((child: Expression) => Average(child).toAggregateExpression()), - "stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()), - "min" -> ((child: Expression) => Min(child).toAggregateExpression()), - "max" -> ((child: Expression) => Max(child).toAggregateExpression())) - - val statisticFns = remainingAggregates.map { agg => - require(availableStatistics.contains(agg), s"$agg is not a recognised statistic") - agg -> availableStatistics(agg) - } - - def percentileAgg(child: Expression): Expression = - new ApproximatePercentile(child, CreateArray(percentiles.map(Literal(_)))) - .toAggregateExpression() - - val outputCols = ds.aggregatableColumns.map(usePrettyExpression(_).sql).toList - - val ret: Seq[Row] = if (outputCols.nonEmpty) { - var aggExprs = statisticFns.toList.flatMap { case (_, colToAgg) => - outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) - } - if (hasPercentiles) { - aggExprs = outputCols.map(c => Column(percentileAgg(Column(c).expr)).as(c)) ++ aggExprs + var percentileIndex = 0 + val statisticFns = selectedStatistics.map { stats => + if (stats.endsWith("%")) { + val index = percentileIndex + percentileIndex += 1 + (child: Expression) => + GetArrayItem( + new ApproximatePercentile(child, Literal.create(percentiles)).toAggregateExpression(), + Literal(index)) + } else { + stats.toLowerCase(Locale.ROOT) match { + case "count" => (child: Expression) => Count(child).toAggregateExpression() + case "mean" => (child: Expression) => Average(child).toAggregateExpression() + case "stddev" => (child: Expression) => StddevSamp(child).toAggregateExpression() + case "min" => (child: Expression) => Min(child).toAggregateExpression() + case "max" => (child: Expression) => Max(child).toAggregateExpression() + case _ => throw new IllegalArgumentException(s"$stats is not a recognised statistic") + } } + } - val row = ds.groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq + val selectedCols = ds.logicalPlan.output + .filter(a => a.dataType.isInstanceOf[NumericType] || a.dataType.isInstanceOf[StringType]) - // Pivot the data so each summary is one row - val grouped: Seq[Seq[Any]] = row.grouped(outputCols.size).toSeq + val aggExprs = statisticFns.flatMap { func => + selectedCols.map(c => Column(Cast(func(c), StringType)).as(c.name)) + } - val basicStats = if (hasPercentiles) grouped.tail else grouped + // If there is no selected columns, we don't need to run this aggregate, so make it a lazy val. + lazy val aggResult = ds.select(aggExprs: _*).queryExecution.toRdd.collect().head - val rows = basicStats.zip(statisticFns).map { case (aggregation, (statistic, _)) => - Row(statistic :: aggregation.toList: _*) - } + // We will have one row for each selected statistic in the result. + val result = Array.fill[InternalRow](selectedStatistics.length) { + // each row has the statistic name, and statistic values of each selected column. 
+ new GenericInternalRow(selectedCols.length + 1) + } - if (hasPercentiles) { - def nullSafeString(x: Any) = if (x == null) null else x.toString - val percentileRows = grouped.head - .map { - case a: Seq[Any] => a - case _ => Seq.fill(percentiles.length)(null: Any) - } - .transpose - .zip(percentileNames) - .map { case (values: Seq[Any], name) => - Row(name :: values.map(nullSafeString).toList: _*) - } - (rows ++ percentileRows) - .sortWith((left, right) => - selectedStatistics.indexOf(left(0)) < selectedStatistics.indexOf(right(0))) - } else { - rows + var rowIndex = 0 + while (rowIndex < result.length) { + val statsName = selectedStatistics(rowIndex) + result(rowIndex).update(0, UTF8String.fromString(statsName)) + for (colIndex <- selectedCols.indices) { + val statsValue = aggResult.getUTF8String(rowIndex * selectedCols.length + colIndex) + result(rowIndex).update(colIndex + 1, statsValue) } - } else { - // If there are no output columns, just output a single column that contains the stats. - selectedStatistics.map(Row(_)) + rowIndex += 1 } // All columns are string type - val schema = StructType( - StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes - // `toArray` forces materialization to make the seq serializable - Dataset.ofRows(ds.sparkSession, LocalRelation.fromExternalRows(schema, ret.toArray.toSeq)) - } + val output = AttributeReference("summary", StringType)() +: + selectedCols.map(c => AttributeReference(c.name, StringType)()) + Dataset.ofRows(ds.sparkSession, LocalRelation(output, result)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 2c7051bf431c3..b2219b4eb8c17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -770,7 +770,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val fooE = intercept[IllegalArgumentException] { person2.summary("foo") } - assert(fooE.getMessage === "requirement failed: foo is not a recognised statistic") + assert(fooE.getMessage === "foo is not a recognised statistic") val parseE = intercept[IllegalArgumentException] { person2.summary("foo%") From 96d58f285bc98d4c2484150eefe7447db4784a86 Mon Sep 17 00:00:00 2001 From: Eric Vandenberg Date: Mon, 10 Jul 2017 14:40:20 +0800 Subject: [PATCH 141/779] [SPARK-21219][CORE] Task retry occurs on same executor due to race condition with blacklisting ## What changes were proposed in this pull request? There's a race condition in the current TaskSetManager where a failed task is added for retry (addPendingTask) and can asynchronously be assigned to an executor *prior* to the blacklist state being updated (updateBlacklistForFailedTask); as a result, the task might re-execute on the same executor. This is particularly problematic if the executor is shutting down, since the retry task immediately becomes a lost task (ExecutorLostFailure). Another side effect is that the actual failure reason gets obscured by the retry task, which never actually executed. There are sample logs showing the issue at https://issues.apache.org/jira/browse/SPARK-21219. The fix is to change the ordering of the addPendingTask and updateBlacklistForFailedTask calls in TaskSetManager.handleFailedTask. ## How was this patch tested? Implemented a unit test that verifies the task is blacklisted before it is added to the pending task list. Ran the unit test without the fix and it fails.
Ran the unit test with the fix and it passes. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Eric Vandenberg Closes #18427 from ericvandenbergfb/blacklistFix. --- .../spark/scheduler/TaskSetManager.scala | 21 ++++----- .../spark/scheduler/TaskSetManagerSuite.scala | 44 ++++++++++++++++++- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 02d374dc37cd5..3968fb7e6356d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -198,7 +198,7 @@ private[spark] class TaskSetManager( private[scheduler] var emittedTaskSizeWarning = false /** Add a task to all the pending-task lists that it should be on. */ - private def addPendingTask(index: Int) { + private[spark] def addPendingTask(index: Int) { for (loc <- tasks(index).preferredLocations) { loc match { case e: ExecutorCacheTaskLocation => @@ -832,15 +832,6 @@ private[spark] class TaskSetManager( sched.dagScheduler.taskEnded(tasks(index), reason, null, accumUpdates, info) - if (successful(index)) { - logInfo(s"Task ${info.id} in stage ${taskSet.id} (TID $tid) failed, but the task will not" + - s" be re-executed (either because the task failed with a shuffle data fetch failure," + - s" so the previous stage needs to be re-run, or because a different copy of the task" + - s" has already succeeded).") - } else { - addPendingTask(index) - } - if (!isZombie && reason.countTowardsTaskFailures) { taskSetBlacklistHelperOpt.foreach(_.updateBlacklistForFailedTask( info.host, info.executorId, index)) @@ -854,6 +845,16 @@ private[spark] class TaskSetManager( return } } + + if (successful(index)) { + logInfo(s"Task ${info.id} in stage ${taskSet.id} (TID $tid) failed, but the task will not" + + s" be re-executed (either because the task failed with a shuffle data fetch failure," + + s" so the previous stage needs to be re-run, or because a different copy of the task" + + s" has already succeeded).") + } else { + addPendingTask(index) + } + maybeFinishTaskSet() } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 80fb674725814..e46900e4e5049 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.mockito.Matchers.{any, anyInt, anyString} -import org.mockito.Mockito.{mock, never, spy, verify, when} +import org.mockito.Mockito.{mock, never, spy, times, verify, when} import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer @@ -1172,6 +1172,48 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(blacklistTracker.isNodeBlacklisted("host1")) } + test("update blacklist before adding pending task to avoid race condition") { + // When a task fails, it should apply the blacklist policy prior to + // retrying the task otherwise there's a race condition where run on + // the same executor that it was intended to be black listed from. + val conf = new SparkConf(). + set(config.BLACKLIST_ENABLED, true) + + // Create a task with two executors. 
+ sc = new SparkContext("local", "test", conf) + val exec = "executor1" + val host = "host1" + val exec2 = "executor2" + val host2 = "host2" + sched = new FakeTaskScheduler(sc, (exec, host), (exec2, host2)) + val taskSet = FakeTask.createTaskSet(1) + + val clock = new ManualClock + val mockListenerBus = mock(classOf[LiveListenerBus]) + val blacklistTracker = new BlacklistTracker(mockListenerBus, conf, None, clock) + val taskSetManager = new TaskSetManager(sched, taskSet, 1, Some(blacklistTracker)) + val taskSetManagerSpy = spy(taskSetManager) + + val taskDesc = taskSetManagerSpy.resourceOffer(exec, host, TaskLocality.ANY) + + // Assert the task has been black listed on the executor it was last executed on. + when(taskSetManagerSpy.addPendingTask(anyInt())).thenAnswer( + new Answer[Unit] { + override def answer(invocationOnMock: InvocationOnMock): Unit = { + val task = invocationOnMock.getArgumentAt(0, classOf[Int]) + assert(taskSetManager.taskSetBlacklistHelperOpt.get. + isExecutorBlacklistedForTask(exec, task)) + } + } + ) + + // Simulate a fake exception + val e = new ExceptionFailure("a", "b", Array(), "c", None) + taskSetManagerSpy.handleFailedTask(taskDesc.get.taskId, TaskState.FAILED, e) + + verify(taskSetManagerSpy, times(1)).addPendingTask(anyInt()) + } + private def createTaskResult( id: Int, accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { From c444d10868c808f4ae43becd5506bf944d9c2e9b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 10 Jul 2017 07:46:47 +0100 Subject: [PATCH 142/779] [MINOR][DOC] Remove obsolete `ec2-scripts.md` ## What changes were proposed in this pull request? Since this document became obsolete, we had better remove this for Apache Spark 2.3.0. The original document is removed via SPARK-12735 on January 2016, and currently it's just redirection page. The only reference in Apache Spark website will go directly to the destination in https://github.com/apache/spark-website/pull/54. ## How was this patch tested? N/A. This is a removal of documentation. Author: Dongjoon Hyun Closes #18578 from dongjoon-hyun/SPARK-REMOVE-EC2. --- docs/ec2-scripts.md | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 docs/ec2-scripts.md diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md deleted file mode 100644 index 6cd39dbed055d..0000000000000 --- a/docs/ec2-scripts.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -layout: global -title: Running Spark on EC2 -redirect: https://github.com/amplab/spark-ec2#readme ---- - -This document has been superseded and replaced by documentation at https://github.com/amplab/spark-ec2#readme From 647963a26a2d4468ebd9b68111ebe68bee501fde Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 10 Jul 2017 15:58:34 +0800 Subject: [PATCH 143/779] [SPARK-20460][SQL] Make it more consistent to handle column name duplication ## What changes were proposed in this pull request? This pr made it more consistent to handle column name duplication. 
In the current master, error handling is different when hitting column name duplication: ``` // json scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq("""{"a":1, "a":1}"""""").toDF().coalesce(1).write.mode("overwrite").text("/tmp/data") scala> spark.read.format("json").schema(schema).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Reference 'a' is ambiguous, could be: a#12, a#13.; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:287) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:181) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:153) scala> spark.read.format("json").load("/tmp/data").show org.apache.spark.sql.AnalysisException: Duplicate column(s) : "a" found, cannot save to JSON format; at org.apache.spark.sql.execution.datasources.json.JsonDataSource.checkConstraints(JsonDataSource.scala:81) at org.apache.spark.sql.execution.datasources.json.JsonDataSource.inferSchema(JsonDataSource.scala:63) at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.inferSchema(JsonFileFormat.scala:57) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:176) at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:176) // csv scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text("/tmp/data") scala> spark.read.format("csv").schema(schema).option("header", false).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Reference 'a' is ambiguous, could be: a#41, a#42.; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:287) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:181) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:153) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:152) // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896) scala> spark.read.format("csv").option("header", true).load("/tmp/data").show +---+---+ | a0| a1| +---+---+ | 1| 1| +---+---+ // parquet scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq((1, 1)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet("/tmp/data") scala> spark.read.format("parquet").schema(schema).option("header", false).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Reference 'a' is ambiguous, could be: a#110, a#111.; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:287) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:181) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:153) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:152) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) ``` When this patch applied, the results change to; ``` // json scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq("""{"a":1, 
"a":1}"""""").toDF().coalesce(1).write.mode("overwrite").text("/tmp/data") scala> spark.read.format("json").schema(schema).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Found duplicate column(s) in datasource: "a"; at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtil.scala:47) at org.apache.spark.sql.util.SchemaUtils$.checkSchemaColumnNameDuplication(SchemaUtil.scala:33) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:186) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:368) scala> spark.read.format("json").load("/tmp/data").show org.apache.spark.sql.AnalysisException: Found duplicate column(s) in datasource: "a"; at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtil.scala:47) at org.apache.spark.sql.util.SchemaUtils$.checkSchemaColumnNameDuplication(SchemaUtil.scala:33) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:186) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:368) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:156) // csv scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text("/tmp/data") scala> spark.read.format("csv").schema(schema).option("header", false).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Found duplicate column(s) in datasource: "a"; at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtil.scala:47) at org.apache.spark.sql.util.SchemaUtils$.checkSchemaColumnNameDuplication(SchemaUtil.scala:33) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:186) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:368) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) scala> spark.read.format("csv").option("header", true).load("/tmp/data").show +---+---+ | a0| a1| +---+---+ | 1| 1| +---+---+ // parquet scala> val schema = StructType(StructField("a", IntegerType) :: StructField("a", IntegerType) :: Nil) scala> Seq((1, 1)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet("/tmp/data") scala> spark.read.format("parquet").schema(schema).option("header", false).load("/tmp/data").show org.apache.spark.sql.AnalysisException: Found duplicate column(s) in datasource: "a"; at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtil.scala:47) at org.apache.spark.sql.util.SchemaUtils$.checkSchemaColumnNameDuplication(SchemaUtil.scala:33) at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:186) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:368) ``` ## How was this patch tested? Added tests in `DataFrameReaderWriterSuite` and `SQLQueryTestSuite`. Author: Takeshi Yamamuro Closes #17758 from maropu/SPARK-20460. 
--- .../sql/catalyst/catalog/SessionCatalog.scala | 16 +--- .../apache/spark/sql/util/SchemaUtils.scala | 58 +++++++++--- .../spark/sql/util/SchemaUtilsSuite.scala | 83 +++++++++++++++++ .../command/createDataSourceTables.scala | 2 - .../spark/sql/execution/command/tables.scala | 8 +- .../spark/sql/execution/command/views.scala | 9 +- .../execution/datasources/DataSource.scala | 43 +++++++-- .../InsertIntoHadoopFsRelationCommand.scala | 14 ++- .../datasources/PartitioningUtils.scala | 10 +-- .../datasources/jdbc/JdbcUtils.scala | 11 +-- .../datasources/json/JsonDataSource.scala | 15 +--- .../sql/execution/datasources/rules.scala | 36 ++++---- .../org/apache/spark/sql/DataFrameSuite.scala | 4 +- .../sql/execution/command/DDLSuite.scala | 56 ++++++++---- .../spark/sql/jdbc/JDBCWriteSuite.scala | 4 +- .../sql/sources/ResolvedDataSourceSuite.scala | 5 +- .../sql/streaming/FileStreamSinkSuite.scala | 37 ++++++++ .../sql/test/DataFrameReaderWriterSuite.scala | 88 +++++++++++++++++++ .../sql/hive/execution/HiveDDLSuite.scala | 2 +- 19 files changed, 382 insertions(+), 119 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c40d5f6031a21..b44d2ee69e1d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, View} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.types.StructType object SessionCatalog { val DEFAULT_DATABASE = "default" @@ -188,19 +188,6 @@ class SessionCatalog( } } - private def checkDuplication(fields: Seq[StructField]): Unit = { - val columnNames = if (conf.caseSensitiveAnalysis) { - fields.map(_.name) - } else { - fields.map(_.name.toLowerCase) - } - if (columnNames.distinct.length != columnNames.length) { - val duplicateColumns = columnNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => x - } - throw new AnalysisException(s"Found duplicate column(s): ${duplicateColumns.mkString(", ")}") - } - } // ---------------------------------------------------------------------------- // Databases // ---------------------------------------------------------------------------- @@ -353,7 +340,6 @@ class SessionCatalog( val tableIdentifier = TableIdentifier(table, Some(db)) requireDbExists(db) requireTableExists(tableIdentifier) - checkDuplication(newSchema) val catalogTable = externalCatalog.getTable(db, table) val oldSchema = catalogTable.schema diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala index e881685ce6262..41ca270095ffb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.util -import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException +import 
org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.types.StructType /** @@ -25,29 +27,63 @@ import org.apache.spark.internal.Logging * * TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]]. */ -private[spark] object SchemaUtils extends Logging { +private[spark] object SchemaUtils { /** - * Checks if input column names have duplicate identifiers. Prints a warning message if + * Checks if an input schema has duplicate column names. This throws an exception if the + * duplication exists. + * + * @param schema schema to check + * @param colType column type name, used in an exception message + * @param caseSensitiveAnalysis whether duplication checks should be case sensitive or not + */ + def checkSchemaColumnNameDuplication( + schema: StructType, colType: String, caseSensitiveAnalysis: Boolean = false): Unit = { + checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis) + } + + // Returns true if a given resolver is case-sensitive + private def isCaseSensitiveAnalysis(resolver: Resolver): Boolean = { + if (resolver == caseSensitiveResolution) { + true + } else if (resolver == caseInsensitiveResolution) { + false + } else { + sys.error("A resolver to check if two identifiers are equal must be " + + "`caseSensitiveResolution` or `caseInsensitiveResolution` in o.a.s.sql.catalyst.") + } + } + + /** + * Checks if input column names have duplicate identifiers. This throws an exception if * the duplication exists. * * @param columnNames column names to check - * @param colType column type name, used in a warning message + * @param colType column type name, used in an exception message + * @param resolver resolver used to determine if two identifiers are equal + */ + def checkColumnNameDuplication( + columnNames: Seq[String], colType: String, resolver: Resolver): Unit = { + checkColumnNameDuplication(columnNames, colType, isCaseSensitiveAnalysis(resolver)) + } + + /** + * Checks if input column names have duplicate identifiers. This throws an exception if + * the duplication exists. + * + * @param columnNames column names to check + * @param colType column type name, used in an exception message * @param caseSensitiveAnalysis whether duplication checks should be case sensitive or not */ def checkColumnNameDuplication( columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { - val names = if (caseSensitiveAnalysis) { - columnNames - } else { - columnNames.map(_.toLowerCase) - } + val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } - logWarning(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}. " + - "You might need to assign different column names.") + throw new AnalysisException( + s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala new file mode 100644 index 0000000000000..a25be2fe61dbd --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.util + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.types.StructType + +class SchemaUtilsSuite extends SparkFunSuite { + + private def resolver(caseSensitiveAnalysis: Boolean): Resolver = { + if (caseSensitiveAnalysis) { + caseSensitiveResolution + } else { + caseInsensitiveResolution + } + } + + Seq((true, ("a", "a"), ("b", "b")), (false, ("a", "A"), ("b", "B"))).foreach { + case (caseSensitive, (a0, a1), (b0, b1)) => + + val testType = if (caseSensitive) "case-sensitive" else "case-insensitive" + test(s"Check column name duplication in $testType cases") { + def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = { + val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " + + duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ") + val schema = StructType.fromDDL(schemaStr) + var msg = intercept[AnalysisException] { + SchemaUtils.checkSchemaColumnNameDuplication( + schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) + }.getMessage + assert(msg.contains(expectedErrorMsg)) + msg = intercept[AnalysisException] { + SchemaUtils.checkColumnNameDuplication( + schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive)) + }.getMessage + assert(msg.contains(expectedErrorMsg)) + msg = intercept[AnalysisException] { + SchemaUtils.checkColumnNameDuplication( + schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) + }.getMessage + assert(msg.contains(expectedErrorMsg)) + } + + checkExceptionCases(s"$a0 INT, b INT, $a1 INT", a0 :: Nil) + checkExceptionCases(s"$a0 INT, b INT, $a1 INT, $a0 INT", a0 :: Nil) + checkExceptionCases(s"$a0 INT, $b0 INT, $a1 INT, $a0 INT, $b1 INT", b0 :: a0 :: Nil) + } + } + + test("Check no exception thrown for valid schemas") { + def checkNoExceptionCases(schemaStr: String, caseSensitive: Boolean): Unit = { + val schema = StructType.fromDDL(schemaStr) + SchemaUtils.checkSchemaColumnNameDuplication( + schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) + SchemaUtils.checkColumnNameDuplication( + schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive)) + SchemaUtils.checkColumnNameDuplication( + schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) + } + + checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = true) + checkNoExceptionCases("Aa INT, b INT, aA INT", caseSensitive = true) + + checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = false) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 729bd39d821c9..04b2534ca5eb1 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.execution.command import java.net.URI -import org.apache.hadoop.fs.Path - import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 8ded1060f7bf0..fa50d12722411 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -20,13 +20,11 @@ package org.apache.spark.sql.execution.command import java.io.File import java.net.URI import java.nio.file.FileSystems -import java.util.Date import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal import scala.util.Try -import org.apache.commons.lang3.StringEscapeUtils import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, Row, SparkSession} @@ -42,6 +40,7 @@ import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.util.Utils /** @@ -202,6 +201,11 @@ case class AlterTableAddColumnsCommand( // make sure any partition columns are at the end of the fields val reorderedSchema = catalogTable.dataSchema ++ columns ++ catalogTable.partitionSchema + + SchemaUtils.checkColumnNameDuplication( + reorderedSchema.map(_.name), "in the table definition of " + table.identifier, + conf.caseSensitiveAnalysis) + catalog.alterTableSchema( table, catalogTable.schema.copy(fields = reorderedSchema.toArray)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index a6d56ca91a3ee..ffdfd527fa701 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.sql.util.SchemaUtils /** @@ -355,15 +356,15 @@ object ViewHelper { properties: Map[String, String], session: SparkSession, analyzedPlan: LogicalPlan): Map[String, String] = { + val queryOutput = analyzedPlan.schema.fieldNames + // Generate the query column names, throw an AnalysisException if there exists duplicate column // names. - val queryOutput = analyzedPlan.schema.fieldNames - assert(queryOutput.distinct.size == queryOutput.size, - s"The view output ${queryOutput.mkString("(", ",", ")")} contains duplicate column name.") + SchemaUtils.checkColumnNameDuplication( + queryOutput, "in the view definition", session.sessionState.conf.resolver) // Generate the view default database name. 
val viewDefaultDatabase = session.sessionState.catalog.getCurrentDatabase - removeQueryColumnNames(properties) ++ generateViewDefaultDatabase(viewDefaultDatabase) ++ generateQueryColumnNames(queryOutput) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 75e530607570f..d36a04f1fff8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -87,6 +87,14 @@ case class DataSource( lazy val providingClass: Class[_] = DataSource.lookupDataSource(className) lazy val sourceInfo: SourceInfo = sourceSchema() private val caseInsensitiveOptions = CaseInsensitiveMap(options) + private val equality = sparkSession.sessionState.conf.resolver + + bucketSpec.map { bucket => + SchemaUtils.checkColumnNameDuplication( + bucket.bucketColumnNames, "in the bucket definition", equality) + SchemaUtils.checkColumnNameDuplication( + bucket.sortColumnNames, "in the sort definition", equality) + } /** * Get the schema of the given FileFormat, if provided by `userSpecifiedSchema`, or try to infer @@ -132,7 +140,6 @@ case class DataSource( // Try to infer partitioning, because no DataSource in the read path provides the partitioning // columns properly unless it is a Hive DataSource val resolved = tempFileIndex.partitionSchema.map { partitionField => - val equality = sparkSession.sessionState.conf.resolver // SPARK-18510: try to get schema from userSpecifiedSchema, otherwise fallback to inferred userSpecifiedSchema.flatMap(_.find(f => equality(f.name, partitionField.name))).getOrElse( partitionField) @@ -146,7 +153,6 @@ case class DataSource( inferredPartitions } else { val partitionFields = partitionColumns.map { partitionColumn => - val equality = sparkSession.sessionState.conf.resolver userSpecifiedSchema.flatMap(_.find(c => equality(c.name, partitionColumn))).orElse { val inferredPartitions = tempFileIndex.partitionSchema val inferredOpt = inferredPartitions.find(p => equality(p.name, partitionColumn)) @@ -172,7 +178,6 @@ case class DataSource( } val dataSchema = userSpecifiedSchema.map { schema => - val equality = sparkSession.sessionState.conf.resolver StructType(schema.filterNot(f => partitionSchema.exists(p => equality(p.name, f.name)))) }.orElse { format.inferSchema( @@ -184,9 +189,18 @@ case class DataSource( s"Unable to infer schema for $format. It must be specified manually.") } - SchemaUtils.checkColumnNameDuplication( - (dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema", - sparkSession.sessionState.conf.caseSensitiveAnalysis) + // We just print a waring message if the data schema and partition schema have the duplicate + // columns. This is because we allow users to do so in the previous Spark releases and + // we have the existing tests for the cases (e.g., `ParquetHadoopFsRelationSuite`). + // See SPARK-18108 and SPARK-21144 for related discussions. 
+ try { + SchemaUtils.checkColumnNameDuplication( + (dataSchema ++ partitionSchema).map(_.name), + "in the data schema and the partition schema", + equality) + } catch { + case e: AnalysisException => logWarning(e.getMessage) + } (dataSchema, partitionSchema) } @@ -391,6 +405,23 @@ case class DataSource( s"$className is not a valid Spark SQL Data Source.") } + relation match { + case hs: HadoopFsRelation => + SchemaUtils.checkColumnNameDuplication( + hs.dataSchema.map(_.name), + "in the data schema", + equality) + SchemaUtils.checkColumnNameDuplication( + hs.partitionSchema.map(_.name), + "in the partition schema", + equality) + case _ => + SchemaUtils.checkColumnNameDuplication( + relation.schema.map(_.name), + "in the data schema", + equality) + } + relation } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 0031567d3d288..c1bcfb8610783 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -21,7 +21,6 @@ import java.io.IOException import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.SparkContext import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} @@ -30,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.util.SchemaUtils /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. 
@@ -64,13 +63,10 @@ case class InsertIntoHadoopFsRelationCommand( assert(children.length == 1) // Most formats don't do well with duplicate columns, so lets not allow that - if (query.schema.fieldNames.length != query.schema.fieldNames.distinct.length) { - val duplicateColumns = query.schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s): $duplicateColumns found, " + - "cannot save to file.") - } + SchemaUtils.checkSchemaColumnNameDuplication( + query.schema, + s"when inserting into $outputPath", + sparkSession.sessionState.conf.caseSensitiveAnalysis) val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options) val fs = outputPath.getFileSystem(hadoopConf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index f61c673baaa58..92358da6d6c67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SchemaUtils // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling. @@ -301,13 +302,8 @@ object PartitioningUtils { normalizedKey -> value } - if (normalizedPartSpec.map(_._1).distinct.length != normalizedPartSpec.length) { - val duplicateColumns = normalizedPartSpec.map(_._1).groupBy(identity).collect { - case (x, ys) if ys.length > 1 => x - } - throw new AnalysisException(s"Found duplicated columns in partition specification: " + - duplicateColumns.mkString(", ")) - } + SchemaUtils.checkColumnNameDuplication( + normalizedPartSpec.map(_._1), "in the partition schema", resolver) normalizedPartSpec.toMap } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 55b2539c13381..bbe9024f13a44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -35,6 +35,7 @@ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, JdbcType} import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.NextIterator @@ -749,14 +750,8 @@ object JdbcUtils extends Logging { val nameEquality = df.sparkSession.sessionState.conf.resolver // checks duplicate columns in the user specified column types. 
- userSchema.fieldNames.foreach { col => - val duplicatesCols = userSchema.fieldNames.filter(nameEquality(_, col)) - if (duplicatesCols.size >= 2) { - throw new AnalysisException( - "Found duplicate column(s) in createTableColumnTypes option value: " + - duplicatesCols.mkString(", ")) - } - } + SchemaUtils.checkColumnNameDuplication( + userSchema.map(_.name), "in the createTableColumnTypes option value", nameEquality) // checks if user specified column names exist in the DataFrame schema userSchema.fieldNames.foreach { col => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala index 5a92a71d19e78..8b7c2709afde1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala @@ -59,9 +59,7 @@ abstract class JsonDataSource extends Serializable { inputPaths: Seq[FileStatus], parsedOptions: JSONOptions): Option[StructType] = { if (inputPaths.nonEmpty) { - val jsonSchema = infer(sparkSession, inputPaths, parsedOptions) - checkConstraints(jsonSchema) - Some(jsonSchema) + Some(infer(sparkSession, inputPaths, parsedOptions)) } else { None } @@ -71,17 +69,6 @@ abstract class JsonDataSource extends Serializable { sparkSession: SparkSession, inputPaths: Seq[FileStatus], parsedOptions: JSONOptions): StructType - - /** Constraints to be imposed on schema to be stored. */ - private def checkConstraints(schema: StructType): Unit = { - if (schema.fieldNames.length != schema.fieldNames.distinct.length) { - val duplicateColumns = schema.fieldNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => "\"" + x + "\"" - }.mkString(", ") - throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " + - s"cannot save to JSON format") - } - } } object JsonDataSource { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 3f4a78580f1eb..41d40aa926fbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.InsertableRelation import org.apache.spark.sql.types.{AtomicType, StructType} +import org.apache.spark.sql.util.SchemaUtils /** * Try to replaces [[UnresolvedRelation]]s if the plan is for direct query on files. 
@@ -222,12 +223,10 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi } private def normalizeCatalogTable(schema: StructType, table: CatalogTable): CatalogTable = { - val columnNames = if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { - schema.map(_.name) - } else { - schema.map(_.name.toLowerCase) - } - checkDuplication(columnNames, "table definition of " + table.identifier) + SchemaUtils.checkSchemaColumnNameDuplication( + schema, + "in the table definition of " + table.identifier, + sparkSession.sessionState.conf.caseSensitiveAnalysis) val normalizedPartCols = normalizePartitionColumns(schema, table) val normalizedBucketSpec = normalizeBucketSpec(schema, table) @@ -253,7 +252,10 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi partCols = table.partitionColumnNames, resolver = sparkSession.sessionState.conf.resolver) - checkDuplication(normalizedPartitionCols, "partition") + SchemaUtils.checkColumnNameDuplication( + normalizedPartitionCols, + "in the partition schema", + sparkSession.sessionState.conf.resolver) if (schema.nonEmpty && normalizedPartitionCols.length == schema.length) { if (DDLUtils.isHiveTable(table)) { @@ -283,8 +285,15 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi tableCols = schema.map(_.name), bucketSpec = bucketSpec, resolver = sparkSession.sessionState.conf.resolver) - checkDuplication(normalizedBucketSpec.bucketColumnNames, "bucket") - checkDuplication(normalizedBucketSpec.sortColumnNames, "sort") + + SchemaUtils.checkColumnNameDuplication( + normalizedBucketSpec.bucketColumnNames, + "in the bucket definition", + sparkSession.sessionState.conf.resolver) + SchemaUtils.checkColumnNameDuplication( + normalizedBucketSpec.sortColumnNames, + "in the sort definition", + sparkSession.sessionState.conf.resolver) normalizedBucketSpec.sortColumnNames.map(schema(_)).map(_.dataType).foreach { case dt if RowOrdering.isOrderable(dt) => // OK @@ -297,15 +306,6 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi } } - private def checkDuplication(colNames: Seq[String], colType: String): Unit = { - if (colNames.distinct.length != colNames.length) { - val duplicateColumns = colNames.groupBy(identity).collect { - case (x, ys) if ys.length > 1 => x - } - failAnalysis(s"Found duplicate column(s) in $colType: ${duplicateColumns.mkString(", ")}") - } - } - private def failAnalysis(msg: String) = throw new AnalysisException(msg) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index b2219b4eb8c17..a5a2e1c38d300 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1189,7 +1189,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Seq((1, 2, 3), (2, 3, 4), (3, 4, 5)).toDF("column1", "column2", "column1") .write.format("parquet").save("temp") } - assert(e.getMessage.contains("Duplicate column(s)")) + assert(e.getMessage.contains("Found duplicate column(s) when inserting into")) assert(e.getMessage.contains("column1")) assert(!e.getMessage.contains("column2")) @@ -1199,7 +1199,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .toDF("column1", "column2", "column3", "column1", "column3") .write.format("json").save("temp") } - assert(f.getMessage.contains("Duplicate column(s)")) + assert(f.getMessage.contains("Found 
duplicate column(s) when inserting into")) assert(f.getMessage.contains("column1")) assert(f.getMessage.contains("column3")) assert(!f.getMessage.contains("column2")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 5c40d8bb4b1ef..5c0a6aa724bf0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -436,16 +436,13 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("create table - duplicate column names in the table definition") { - val e = intercept[AnalysisException] { - sql("CREATE TABLE tbl(a int, a string) USING json") - } - assert(e.message == "Found duplicate column(s) in table definition of `tbl`: a") - - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - val e2 = intercept[AnalysisException] { - sql("CREATE TABLE tbl(a int, A string) USING json") + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val errMsg = intercept[AnalysisException] { + sql(s"CREATE TABLE t($c0 INT, $c1 INT) USING parquet") + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the table definition of `t`")) } - assert(e2.message == "Found duplicate column(s) in table definition of `tbl`: a") } } @@ -466,17 +463,33 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("create table - column repeated in partition columns") { - val e = intercept[AnalysisException] { - sql("CREATE TABLE tbl(a int) USING json PARTITIONED BY (a, a)") + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val errMsg = intercept[AnalysisException] { + sql(s"CREATE TABLE t($c0 INT) USING parquet PARTITIONED BY ($c0, $c1)") + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the partition schema")) + } } - assert(e.message == "Found duplicate column(s) in partition: a") } - test("create table - column repeated in bucket columns") { - val e = intercept[AnalysisException] { - sql("CREATE TABLE tbl(a int) USING json CLUSTERED BY (a, a) INTO 4 BUCKETS") + test("create table - column repeated in bucket/sort columns") { + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + var errMsg = intercept[AnalysisException] { + sql(s"CREATE TABLE t($c0 INT) USING parquet CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS") + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the bucket definition")) + + errMsg = intercept[AnalysisException] { + sql(s""" + |CREATE TABLE t($c0 INT, col INT) USING parquet CLUSTERED BY (col) + | SORTED BY ($c0, $c1) INTO 2 BUCKETS + """.stripMargin) + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the sort definition")) + } } - assert(e.message == "Found duplicate column(s) in bucket: a") } test("Refresh table after changing the data source table partitioning") { @@ -528,6 +541,17 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("create view - duplicate column names in the view definition") { + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + 
withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val errMsg = intercept[AnalysisException] { + sql(s"CREATE VIEW t AS SELECT * FROM VALUES (1, 1) AS t($c0, $c1)") + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the view definition")) + } + } + } + test("Alter/Describe Database") { val catalog = spark.sessionState.catalog val databaseNames = Seq("db1", "`database`") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 92f50a095f19b..2334d5ae32dc3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.jdbc -import java.sql.{Date, DriverManager, Timestamp} +import java.sql.DriverManager import java.util.Properties import scala.collection.JavaConverters.propertiesAsScalaMapConverter @@ -479,7 +479,7 @@ class JDBCWriteSuite extends SharedSQLContext with BeforeAndAfter { .jdbc(url1, "TEST.USERDBTYPETEST", properties) }.getMessage() assert(msg.contains( - "Found duplicate column(s) in createTableColumnTypes option value: name, NaMe")) + "Found duplicate column(s) in the createTableColumnTypes option value: `name`")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala index 0f97fd78d2ffb..308c5079c44bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala @@ -21,11 +21,12 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.test.SharedSQLContext -class ResolvedDataSourceSuite extends SparkFunSuite { +class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext { private def getProvidingClass(name: String): Class[_] = DataSource( - sparkSession = null, + sparkSession = spark, className = name, options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID) ).providingClass diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index bb6a27803bb20..6676099d426ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.execution.DataSourceScanExec import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.Utils @@ -352,4 +353,40 @@ class FileStreamSinkSuite extends StreamTest { assertAncestorIsNotMetadataDirectory(s"/a/b/c") assertAncestorIsNotMetadataDirectory(s"/a/b/c/${FileStreamSink.metadataDir}extra") } + + test("SPARK-20460 Check name duplication in schema") { + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> 
caseSensitive.toString) { + val inputData = MemoryStream[(Int, Int)] + val df = inputData.toDF() + + val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath + val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + + var query: StreamingQuery = null + try { + query = + df.writeStream + .option("checkpointLocation", checkpointDir) + .format("json") + .start(outputDir) + + inputData.addData((1, 1)) + + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + if (query != null) { + query.stop() + } + } + + val errorMsg = intercept[AnalysisException] { + spark.read.schema(s"$c0 INT, $c1 INT").json(outputDir).as[(Int, Int)] + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the data schema: ")) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 306aecb5bbc86..569bac156b531 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -687,4 +688,91 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be testRead(spark.read.schema(userSchemaString).text(dir, dir), data ++ data, userSchema) testRead(spark.read.schema(userSchemaString).text(Seq(dir, dir): _*), data ++ data, userSchema) } + + test("SPARK-20460 Check name duplication in buckets") { + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + var errorMsg = intercept[AnalysisException] { + Seq((1, 1)).toDF("col", c0).write.bucketBy(2, c0, c1).saveAsTable("t") + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the bucket definition")) + + errorMsg = intercept[AnalysisException] { + Seq((1, 1)).toDF("col", c0).write.bucketBy(2, "col").sortBy(c0, c1).saveAsTable("t") + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the sort definition")) + } + } + } + + test("SPARK-20460 Check name duplication in schema") { + def checkWriteDataColumnDuplication( + format: String, colName0: String, colName1: String, tempDir: File): Unit = { + val errorMsg = intercept[AnalysisException] { + Seq((1, 1)).toDF(colName0, colName1).write.format(format).mode("overwrite") + .save(tempDir.getAbsolutePath) + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) when inserting into")) + } + + def checkReadUserSpecifiedDataColumnDuplication( + df: DataFrame, format: String, colName0: String, colName1: String, tempDir: File): Unit = { + val testDir = Utils.createTempDir(tempDir.getAbsolutePath) + df.write.format(format).mode("overwrite").save(testDir.getAbsolutePath) + val errorMsg = intercept[AnalysisException] { + spark.read.format(format).schema(s"$colName0 INT, $colName1 INT") + .load(testDir.getAbsolutePath) + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the data schema:")) + } + + def 
checkReadPartitionColumnDuplication( + format: String, colName0: String, colName1: String, tempDir: File): Unit = { + val testDir = Utils.createTempDir(tempDir.getAbsolutePath) + Seq(1).toDF("col").write.format(format).mode("overwrite") + .save(s"${testDir.getAbsolutePath}/$colName0=1/$colName1=1") + val errorMsg = intercept[AnalysisException] { + spark.read.format(format).load(testDir.getAbsolutePath) + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the partition schema:")) + } + + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + withTempDir { src => + // Check CSV format + checkWriteDataColumnDuplication("csv", c0, c1, src) + checkReadUserSpecifiedDataColumnDuplication( + Seq((1, 1)).toDF("c0", "c1"), "csv", c0, c1, src) + // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896) + var testDir = Utils.createTempDir(src.getAbsolutePath) + Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(testDir.getAbsolutePath) + val df = spark.read.format("csv").option("inferSchema", true).option("header", true) + .load(testDir.getAbsolutePath) + checkAnswer(df, Row(1, 1)) + checkReadPartitionColumnDuplication("csv", c0, c1, src) + + // Check JSON format + checkWriteDataColumnDuplication("json", c0, c1, src) + checkReadUserSpecifiedDataColumnDuplication( + Seq((1, 1)).toDF("c0", "c1"), "json", c0, c1, src) + // Inferred schema cases + testDir = Utils.createTempDir(src.getAbsolutePath) + Seq(s"""{"$c0":3, "$c1":5}""").toDF().write.mode("overwrite") + .text(testDir.getAbsolutePath) + val errorMsg = intercept[AnalysisException] { + spark.read.format("json").option("inferSchema", true).load(testDir.getAbsolutePath) + }.getMessage + assert(errorMsg.contains("Found duplicate column(s) in the data schema:")) + checkReadPartitionColumnDuplication("json", c0, c1, src) + + // Check Parquet format + checkWriteDataColumnDuplication("parquet", c0, c1, src) + checkReadUserSpecifiedDataColumnDuplication( + Seq((1, 1)).toDF("c0", "c1"), "parquet", c0, c1, src) + checkReadPartitionColumnDuplication("parquet", c0, c1, src) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 31fa3d2447467..12daf3af11abe 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -345,7 +345,7 @@ class HiveDDLSuite val e = intercept[AnalysisException] { sql("CREATE TABLE tbl(a int) PARTITIONED BY (a string)") } - assert(e.message == "Found duplicate column(s) in table definition of `default`.`tbl`: a") + assert(e.message == "Found duplicate column(s) in the table definition of `default`.`tbl`: `a`") } test("add/drop partition with location - managed table") { From 6a06c4b03c4dd86241fb9d11b4360371488f0e53 Mon Sep 17 00:00:00 2001 From: jinxing Date: Mon, 10 Jul 2017 21:06:58 +0800 Subject: [PATCH 144/779] [SPARK-21342] Fix DownloadCallback to work well with RetryingBlockFetcher. ## What changes were proposed in this pull request? When `RetryingBlockFetcher` retries fetching blocks. There could be two `DownloadCallback`s download the same content to the same target file. It could cause `ShuffleBlockFetcherIterator` reading a partial result. 
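The fix (described just below) routes temp-file creation and cleanup through a small new `TempShuffleFileManager` interface instead of handing pre-created `shuffleFiles` arrays to the fetcher. Its shape, inferred from how `OneForOneBlockFetcher` uses it in this patch rather than quoted from the new file, is roughly:

```java
import java.io.File;

// Inferred sketch of common/network-shuffle/.../TempShuffleFileManager.java,
// based on its call sites in OneForOneBlockFetcher:
//  - createTempShuffleFile() hands out a fresh target file per downloaded chunk;
//  - registerTempShuffleFileToClean(file) is called after a successful download,
//    and a false return means the caller must delete the file itself.
public interface TempShuffleFileManager {
  File createTempShuffleFile();
  boolean registerTempShuffleFileToClean(File file);
}
```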
This pr proposes to create and delete the tmp files in `OneForOneBlockFetcher` Author: jinxing Author: Shixiong Zhu Closes #18565 from jinxing64/SPARK-21342. --- .../shuffle/ExternalShuffleClient.java | 7 ++-- .../shuffle/OneForOneBlockFetcher.java | 34 +++++++++++------- .../spark/network/shuffle/ShuffleClient.java | 13 +++++-- .../shuffle/TempShuffleFileManager.java | 36 +++++++++++++++++++ .../network/sasl/SaslIntegrationSuite.java | 2 +- .../shuffle/OneForOneBlockFetcherSuite.java | 2 +- .../spark/network/BlockTransferService.scala | 8 ++--- .../netty/NettyBlockTransferService.scala | 9 +++-- .../storage/ShuffleBlockFetcherIterator.scala | 28 ++++++++++----- .../spark/storage/BlockManagerSuite.scala | 4 +-- .../ShuffleBlockFetcherIteratorSuite.scala | 10 +++--- 11 files changed, 108 insertions(+), 45 deletions(-) create mode 100644 common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java index 6ac9302517ee0..31bd24e5038b2 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java @@ -17,7 +17,6 @@ package org.apache.spark.network.shuffle; -import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -91,15 +90,15 @@ public void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - File[] shuffleFiles) { + TempShuffleFileManager tempShuffleFileManager) { checkInit(); logger.debug("External shuffle fetch from {}:{} (executor id {})", host, port, execId); try { RetryingBlockFetcher.BlockFetchStarter blockFetchStarter = (blockIds1, listener1) -> { TransportClient client = clientFactory.createClient(host, port); - new OneForOneBlockFetcher(client, appId, execId, blockIds1, listener1, conf, - shuffleFiles).start(); + new OneForOneBlockFetcher(client, appId, execId, + blockIds1, listener1, conf, tempShuffleFileManager).start(); }; int maxRetries = conf.maxIORetries(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index d46ce2e0e6b78..2f160d12af22b 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -57,11 +57,21 @@ public class OneForOneBlockFetcher { private final String[] blockIds; private final BlockFetchingListener listener; private final ChunkReceivedCallback chunkCallback; - private TransportConf transportConf = null; - private File[] shuffleFiles = null; + private final TransportConf transportConf; + private final TempShuffleFileManager tempShuffleFileManager; private StreamHandle streamHandle = null; + public OneForOneBlockFetcher( + TransportClient client, + String appId, + String execId, + String[] blockIds, + BlockFetchingListener listener, + TransportConf transportConf) { + this(client, appId, execId, blockIds, listener, transportConf, null); + } + public OneForOneBlockFetcher( TransportClient client, String appId, @@ -69,18 +79,14 @@ public OneForOneBlockFetcher( String[] blockIds, 
BlockFetchingListener listener, TransportConf transportConf, - File[] shuffleFiles) { + TempShuffleFileManager tempShuffleFileManager) { this.client = client; this.openMessage = new OpenBlocks(appId, execId, blockIds); this.blockIds = blockIds; this.listener = listener; this.chunkCallback = new ChunkCallback(); this.transportConf = transportConf; - if (shuffleFiles != null) { - this.shuffleFiles = shuffleFiles; - assert this.shuffleFiles.length == blockIds.length: - "Number of shuffle files should equal to blocks"; - } + this.tempShuffleFileManager = tempShuffleFileManager; } /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. */ @@ -119,9 +125,9 @@ public void onSuccess(ByteBuffer response) { // Immediately request all chunks -- we expect that the total size of the request is // reasonable due to higher level chunking in [[ShuffleBlockFetcherIterator]]. for (int i = 0; i < streamHandle.numChunks; i++) { - if (shuffleFiles != null) { + if (tempShuffleFileManager != null) { client.stream(OneForOneStreamManager.genStreamChunkId(streamHandle.streamId, i), - new DownloadCallback(shuffleFiles[i], i)); + new DownloadCallback(i)); } else { client.fetchChunk(streamHandle.streamId, i, chunkCallback); } @@ -157,8 +163,8 @@ private class DownloadCallback implements StreamCallback { private File targetFile = null; private int chunkIndex; - DownloadCallback(File targetFile, int chunkIndex) throws IOException { - this.targetFile = targetFile; + DownloadCallback(int chunkIndex) throws IOException { + this.targetFile = tempShuffleFileManager.createTempShuffleFile(); this.channel = Channels.newChannel(new FileOutputStream(targetFile)); this.chunkIndex = chunkIndex; } @@ -174,6 +180,9 @@ public void onComplete(String streamId) throws IOException { ManagedBuffer buffer = new FileSegmentManagedBuffer(transportConf, targetFile, 0, targetFile.length()); listener.onBlockFetchSuccess(blockIds[chunkIndex], buffer); + if (!tempShuffleFileManager.registerTempShuffleFileToClean(targetFile)) { + targetFile.delete(); + } } @Override @@ -182,6 +191,7 @@ public void onFailure(String streamId, Throwable cause) throws IOException { // On receipt of a failure, fail every block from chunkIndex onwards. String[] remainingBlockIds = Arrays.copyOfRange(blockIds, chunkIndex, blockIds.length); failRemainingBlocks(remainingBlockIds, cause); + targetFile.delete(); } } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java index 978ff5a2a8699..9e77bee7f9ee6 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java @@ -18,7 +18,6 @@ package org.apache.spark.network.shuffle; import java.io.Closeable; -import java.io.File; /** Provides an interface for reading shuffle files, either from an Executor or external service. */ public abstract class ShuffleClient implements Closeable { @@ -35,6 +34,16 @@ public void init(String appId) { } * Note that this API takes a sequence so the implementation can batch requests, and does not * return a future so the underlying implementation can invoke onBlockFetchSuccess as soon as * the data of a block is fetched, rather than waiting for all blocks to be fetched. + * + * @param host the host of the remote node. + * @param port the port of the remote node. + * @param execId the executor id. 
+ * @param blockIds block ids to fetch. + * @param listener the listener to receive block fetching status. + * @param tempShuffleFileManager TempShuffleFileManager to create and clean temp shuffle files. + * If it's not null, the remote blocks will be streamed + * into temp shuffle files to reduce the memory usage, otherwise, + * they will be kept in memory. */ public abstract void fetchBlocks( String host, @@ -42,5 +51,5 @@ public abstract void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - File[] shuffleFiles); + TempShuffleFileManager tempShuffleFileManager); } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java new file mode 100644 index 0000000000000..84a5ed6a276bd --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle; + +import java.io.File; + +/** + * A manager to create temp shuffle block files to reduce the memory usage and also clean temp + * files when they won't be used any more. + */ +public interface TempShuffleFileManager { + + /** Create a temp shuffle block file. */ + File createTempShuffleFile(); + + /** + * Register a temp shuffle file to clean up when it won't be used any more. Return whether the + * file is registered successfully. If `false`, the caller should clean up the file by itself. 
+ */ + boolean registerTempShuffleFileToClean(File file); +} diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java index 8110f1e004c73..02e6eb3a4467e 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java @@ -204,7 +204,7 @@ public void onBlockFetchFailure(String blockId, Throwable t) { String[] blockIds = { "shuffle_0_1_2", "shuffle_0_3_4" }; OneForOneBlockFetcher fetcher = - new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener, conf, null); + new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener, conf); fetcher.start(); blockFetchLatch.await(); checkSecurityException(exception.get()); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java index 61d82214e7d30..dc947a619bf02 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java @@ -131,7 +131,7 @@ private static BlockFetchingListener fetchBlocks(LinkedHashMap { diff --git a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala index 6860214c7fe39..fe5fd2da039bb 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala @@ -17,7 +17,7 @@ package org.apache.spark.network -import java.io.{Closeable, File} +import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} @@ -26,7 +26,7 @@ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils @@ -68,7 +68,7 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo execId: String, blockIds: Array[String], listener: BlockFetchingListener, - shuffleFiles: Array[File]): Unit + tempShuffleFileManager: TempShuffleFileManager): Unit /** * Upload a single block to a remote node, available only after [[init]] is invoked. 
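A minimal sketch of the new `TempShuffleFileManager` contract in use (illustrative only and not part of the patch; the real implementation added here is `ShuffleBlockFetcherIterator`, whose changes appear further below):

```scala
import java.io.File
import java.nio.file.Files
import scala.collection.mutable
import org.apache.spark.network.shuffle.TempShuffleFileManager

// Illustrative sketch: hand out temp files and track them until close();
// once closed, refuse registration so the caller deletes the file itself,
// matching the contract documented on the interface above.
class SimpleTempShuffleFileManager extends TempShuffleFileManager {
  private var closed = false
  private val registered = mutable.HashSet.empty[File]

  override def createTempShuffleFile(): File =
    Files.createTempFile("shuffle", ".tmp").toFile

  override def registerTempShuffleFileToClean(file: File): Boolean = synchronized {
    if (closed) {
      false // caller is responsible for deleting `file`
    } else {
      registered += file
      true
    }
  }

  def close(): Unit = synchronized {
    closed = true
    registered.foreach(_.delete())
    registered.clear()
  }
}
```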
@@ -101,7 +101,7 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo ret.flip() result.success(new NioManagedBuffer(ret)) } - }, shuffleFiles = null) + }, tempShuffleFileManager = null) ThreadUtils.awaitResult(result.future, Duration.Inf) } diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index b13a9c681e543..30ff93897f98a 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -17,7 +17,6 @@ package org.apache.spark.network.netty -import java.io.File import java.nio.ByteBuffer import scala.collection.JavaConverters._ @@ -30,7 +29,7 @@ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap, TransportClientFactory} import org.apache.spark.network.crypto.{AuthClientBootstrap, AuthServerBootstrap} import org.apache.spark.network.server._ -import org.apache.spark.network.shuffle.{BlockFetchingListener, OneForOneBlockFetcher, RetryingBlockFetcher} +import org.apache.spark.network.shuffle.{BlockFetchingListener, OneForOneBlockFetcher, RetryingBlockFetcher, TempShuffleFileManager} import org.apache.spark.network.shuffle.protocol.UploadBlock import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.JavaSerializer @@ -90,14 +89,14 @@ private[spark] class NettyBlockTransferService( execId: String, blockIds: Array[String], listener: BlockFetchingListener, - shuffleFiles: Array[File]): Unit = { + tempShuffleFileManager: TempShuffleFileManager): Unit = { logTrace(s"Fetch blocks from $host:$port (executor id $execId)") try { val blockFetchStarter = new RetryingBlockFetcher.BlockFetchStarter { override def createAndStart(blockIds: Array[String], listener: BlockFetchingListener) { val client = clientFactory.createClient(host, port) - new OneForOneBlockFetcher(client, appId, execId, blockIds.toArray, listener, - transportConf, shuffleFiles).start() + new OneForOneBlockFetcher(client, appId, execId, blockIds, listener, + transportConf, tempShuffleFileManager).start() } } diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index a10f1feadd0af..81d822dc8a98f 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -28,7 +28,7 @@ import scala.collection.mutable.{ArrayBuffer, HashSet, Queue} import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils import org.apache.spark.util.io.ChunkedByteBufferOutputStream @@ -66,7 +66,7 @@ final class ShuffleBlockFetcherIterator( maxReqsInFlight: Int, maxReqSizeShuffleToMem: Long, detectCorrupt: Boolean) - extends Iterator[(BlockId, InputStream)] with Logging { + extends Iterator[(BlockId, InputStream)] with TempShuffleFileManager with Logging { import 
ShuffleBlockFetcherIterator._ @@ -135,7 +135,8 @@ final class ShuffleBlockFetcherIterator( * A set to store the files used for shuffling remote huge blocks. Files in this set will be * deleted when cleanup. This is a layer of defensiveness against disk file leaks. */ - val shuffleFilesSet = mutable.HashSet[File]() + @GuardedBy("this") + private[this] val shuffleFilesSet = mutable.HashSet[File]() initialize() @@ -149,6 +150,19 @@ final class ShuffleBlockFetcherIterator( currentResult = null } + override def createTempShuffleFile(): File = { + blockManager.diskBlockManager.createTempLocalBlock()._2 + } + + override def registerTempShuffleFileToClean(file: File): Boolean = synchronized { + if (isZombie) { + false + } else { + shuffleFilesSet += file + true + } + } + /** * Mark the iterator as zombie, and release all buffers that haven't been deserialized yet. */ @@ -176,7 +190,7 @@ final class ShuffleBlockFetcherIterator( } shuffleFilesSet.foreach { file => if (!file.delete()) { - logInfo("Failed to cleanup shuffle fetch temp file " + file.getAbsolutePath()); + logWarning("Failed to cleanup shuffle fetch temp file " + file.getAbsolutePath()) } } } @@ -221,12 +235,8 @@ final class ShuffleBlockFetcherIterator( // already encrypted and compressed over the wire(w.r.t. the related configs), we can just fetch // the data and write it to file directly. if (req.size > maxReqSizeShuffleToMem) { - val shuffleFiles = blockIds.map { _ => - blockManager.diskBlockManager.createTempLocalBlock()._2 - }.toArray - shuffleFilesSet ++= shuffleFiles shuffleClient.fetchBlocks(address.host, address.port, address.executorId, blockIds.toArray, - blockFetchingListener, shuffleFiles) + blockFetchingListener, this) } else { shuffleClient.fetchBlocks(address.host, address.port, address.executorId, blockIds.toArray, blockFetchingListener, null) diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 086adccea954c..755a61a438a6a 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -45,7 +45,7 @@ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.netty.{NettyBlockTransferService, SparkTransportConf} import org.apache.spark.network.server.{NoOpRpcHandler, TransportServer, TransportServerBootstrap} -import org.apache.spark.network.shuffle.BlockFetchingListener +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, RegisterExecutor} import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.LiveListenerBus @@ -1382,7 +1382,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE execId: String, blockIds: Array[String], listener: BlockFetchingListener, - shuffleFiles: Array[File]): Unit = { + tempShuffleFileManager: TempShuffleFileManager): Unit = { listener.onBlockFetchSuccess("mockBlockId", new NioManagedBuffer(ByteBuffer.allocate(1))) } diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index 559b3faab8fd2..6a70cedf769b8 100644 --- 
a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -33,7 +33,7 @@ import org.scalatest.PrivateMethodTester import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.network._ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.BlockFetchingListener +import org.apache.spark.network.shuffle.{BlockFetchingListener, TempShuffleFileManager} import org.apache.spark.network.util.LimitedInputStream import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils @@ -432,12 +432,12 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val remoteBlocks = Map[BlockId, ManagedBuffer]( ShuffleBlockId(0, 0, 0) -> createMockManagedBuffer()) val transfer = mock(classOf[BlockTransferService]) - var shuffleFiles: Array[File] = null + var tempShuffleFileManager: TempShuffleFileManager = null when(transfer.fetchBlocks(any(), any(), any(), any(), any(), any())) .thenAnswer(new Answer[Unit] { override def answer(invocation: InvocationOnMock): Unit = { val listener = invocation.getArguments()(4).asInstanceOf[BlockFetchingListener] - shuffleFiles = invocation.getArguments()(5).asInstanceOf[Array[File]] + tempShuffleFileManager = invocation.getArguments()(5).asInstanceOf[TempShuffleFileManager] Future { listener.onBlockFetchSuccess( ShuffleBlockId(0, 0, 0).toString, remoteBlocks(ShuffleBlockId(0, 0, 0))) @@ -466,13 +466,13 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT fetchShuffleBlock(blocksByAddress1) // `maxReqSizeShuffleToMem` is 200, which is greater than the block size 100, so don't fetch // shuffle block to disk. - assert(shuffleFiles === null) + assert(tempShuffleFileManager == null) val blocksByAddress2 = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 300L)).toSeq)) fetchShuffleBlock(blocksByAddress2) // `maxReqSizeShuffleToMem` is 200, which is smaller than the block size 300, so fetch // shuffle block to disk. - assert(shuffleFiles != null) + assert(tempShuffleFileManager != null) } } From 18b3b00ecfde6c694fb6fee4f4d07d04e3d08ccf Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 10 Jul 2017 09:26:42 -0700 Subject: [PATCH 145/779] [SPARK-21272] SortMergeJoin LeftAnti does not update numOutputRows ## What changes were proposed in this pull request? Updating numOutputRows metric was missing from one return path of LeftAnti SortMergeJoin. ## How was this patch tested? Non-zero output rows manually seen in metrics. Author: Juliusz Sompolski Closes #18494 from juliuszsompolski/SPARK-21272. 
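A rough way to observe the fixed metric from user code (a sketch assuming a local `SparkSession`; the data and names below are invented for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.joins.SortMergeJoinExec

val spark = SparkSession.builder().master("local[*]").appName("smj-anti-metrics").getOrCreate()
import spark.implicits._

// Disable broadcast joins so the planner picks a sort-merge join.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

val left  = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("a", "b")
val right = Seq(3, 4).toDF("a")

val df = left.join(right, Seq("a"), "left_anti")
df.collect() // only a = 1 and a = 2 survive the anti join

// Before this patch, rows emitted on the "no buffered matches" path were not
// counted; with the fix, "number of output rows" should read 2 here.
df.queryExecution.executedPlan.collectFirst { case smj: SortMergeJoinExec =>
  println(smj.metrics("numOutputRows").value)
}
```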
--- .../sql/execution/joins/SortMergeJoinExec.scala | 1 + .../spark/sql/execution/metric/SQLMetricsSuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 8445c26eeee58..639b8e00c121b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -290,6 +290,7 @@ case class SortMergeJoinExec( currentLeftRow = smjScanner.getStreamedRow val currentRightMatches = smjScanner.getBufferedMatches if (currentRightMatches == null || currentRightMatches.length == 0) { + numOutputRows += 1 return true } var found = false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index cb3405b2fe19b..2911cbbeee479 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -483,6 +483,18 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { } } + test("SortMergeJoin(left-anti) metrics") { + val anti = testData2.filter("a > 2") + withTempView("antiData") { + anti.createOrReplaceTempView("antiData") + val df = spark.sql( + "SELECT * FROM testData2 ANTI JOIN antiData ON testData2.a = antiData.a") + testSparkPlanMetrics(df, 1, Map( + 0L -> ("SortMergeJoin", Map("number of output rows" -> 4L))) + ) + } + } + test("save metrics") { withTempPath { file => // person creates a temporary view. get the DF before listing previous execution IDs From 2bfd5accdce2ae31feeeddf213a019cf8ec97663 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 10 Jul 2017 10:40:03 -0700 Subject: [PATCH 146/779] [SPARK-21266][R][PYTHON] Support schema a DDL-formatted string in dapply/gapply/from_json ## What changes were proposed in this pull request? This PR supports schema in a DDL formatted string for `from_json` in R/Python and `dapply` and `gapply` in R, which are commonly used and/or consistent with Scala APIs. Additionally, this PR exposes `structType` in R to allow working around in other possible corner cases. **Python** `from_json` ```python from pyspark.sql.functions import from_json data = [(1, '''{"a": 1}''')] df = spark.createDataFrame(data, ("key", "value")) df.select(from_json(df.value, "a INT").alias("json")).show() ``` **R** `from_json` ```R df <- sql("SELECT named_struct('name', 'Bob') as people") df <- mutate(df, people_json = to_json(df$people)) head(select(df, from_json(df$people_json, "name STRING"))) ``` `structType.character` ```R structType("a STRING, b INT") ``` `dapply` ```R dapply(createDataFrame(list(list(1.0)), "a"), function(x) {x}, "a DOUBLE") ``` `gapply` ```R gapply(createDataFrame(list(list(1.0)), "a"), "a", function(key, x) { x }, "a DOUBLE") ``` ## How was this patch tested? Doc tests for `from_json` in Python and unit tests `test_sparkSQL.R` in R. Author: hyukjinkwon Closes #18498 from HyukjinKwon/SPARK-21266. 
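On the JVM side, the DDL-formatted strings shown above are parsed by `StructType.fromDDL`, which the new R `structType.character` method calls into. A quick Scala sketch of the equivalent parse (output shown approximately):

```scala
import org.apache.spark.sql.types.StructType

// The same kind of DDL string used in the R examples, parsed directly on the JVM.
val schema = StructType.fromDDL("a INT, c STRING, avg DOUBLE")
schema.printTreeString()
// root
//  |-- a: integer (nullable = true)
//  |-- c: string (nullable = true)
//  |-- avg: double (nullable = true)
```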
--- R/pkg/NAMESPACE | 2 + R/pkg/R/DataFrame.R | 36 ++++- R/pkg/R/functions.R | 12 +- R/pkg/R/group.R | 3 + R/pkg/R/schema.R | 29 +++- R/pkg/tests/fulltests/test_sparkSQL.R | 136 ++++++++++-------- python/pyspark/sql/functions.py | 11 +- .../org/apache/spark/sql/functions.scala | 7 +- 8 files changed, 160 insertions(+), 76 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index b7fdae58de459..232f5cf31f319 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -429,6 +429,7 @@ export("structField", "structField.character", "print.structField", "structType", + "structType.character", "structType.jobj", "structType.structField", "print.structType") @@ -465,5 +466,6 @@ S3method(print, summary.GBTRegressionModel) S3method(print, summary.GBTClassificationModel) S3method(structField, character) S3method(structField, jobj) +S3method(structType, character) S3method(structType, jobj) S3method(structType, structField) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 3b9d42d6e7158..e7a166c3014c1 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1391,6 +1391,10 @@ setMethod("summarize", }) dapplyInternal <- function(x, func, schema) { + if (is.character(schema)) { + schema <- structType(schema) + } + packageNamesArr <- serialize(.sparkREnv[[".packages"]], connection = NULL) @@ -1408,6 +1412,8 @@ dapplyInternal <- function(x, func, schema) { dataFrame(sdf) } +setClassUnion("characterOrstructType", c("character", "structType")) + #' dapply #' #' Apply a function to each partition of a SparkDataFrame. @@ -1418,10 +1424,11 @@ dapplyInternal <- function(x, func, schema) { #' to each partition will be passed. #' The output of func should be a R data.frame. #' @param schema The schema of the resulting SparkDataFrame after the function is applied. -#' It must match the output of func. +#' It must match the output of func. Since Spark 2.3, the DDL-formatted string +#' is also supported for the schema. #' @family SparkDataFrame functions #' @rdname dapply -#' @aliases dapply,SparkDataFrame,function,structType-method +#' @aliases dapply,SparkDataFrame,function,characterOrstructType-method #' @name dapply #' @seealso \link{dapplyCollect} #' @export @@ -1444,6 +1451,17 @@ dapplyInternal <- function(x, func, schema) { #' y <- cbind(y, y[1] + 1L) #' }, #' schema) +#' +#' # The schema also can be specified in a DDL-formatted string. +#' schema <- "a INT, d DOUBLE, c STRING, d INT" +#' df1 <- dapply( +#' df, +#' function(x) { +#' y <- x[x[1] > 1, ] +#' y <- cbind(y, y[1] + 1L) +#' }, +#' schema) +#' #' collect(df1) #' # the result #' # a b c d @@ -1452,7 +1470,7 @@ dapplyInternal <- function(x, func, schema) { #' } #' @note dapply since 2.0.0 setMethod("dapply", - signature(x = "SparkDataFrame", func = "function", schema = "structType"), + signature(x = "SparkDataFrame", func = "function", schema = "characterOrstructType"), function(x, func, schema) { dapplyInternal(x, func, schema) }) @@ -1522,6 +1540,7 @@ setMethod("dapplyCollect", #' @param schema the schema of the resulting SparkDataFrame after the function is applied. #' The schema must match to output of \code{func}. It has to be defined for each #' output column with preferred output column name and corresponding data type. +#' Since Spark 2.3, the DDL-formatted string is also supported for the schema. #' @return A SparkDataFrame. 
#' @family SparkDataFrame functions #' @aliases gapply,SparkDataFrame-method @@ -1541,7 +1560,7 @@ setMethod("dapplyCollect", #' #' Here our output contains three columns, the key which is a combination of two #' columns with data types integer and string and the mean which is a double. -#' schema <- structType(structField("a", "integer"), structField("c", "string"), +#' schema <- structType(structField("a", "integer"), structField("c", "string"), #' structField("avg", "double")) #' result <- gapply( #' df, @@ -1550,6 +1569,15 @@ setMethod("dapplyCollect", #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) #' }, schema) #' +#' The schema also can be specified in a DDL-formatted string. +#' schema <- "a INT, c STRING, avg DOUBLE" +#' result <- gapply( +#' df, +#' c("a", "c"), +#' function(key, x) { +#' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) +#' }, schema) +#' #' We can also group the data and afterwards call gapply on GroupedData. #' For Example: #' gdf <- group_by(df, "a", "c") diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f28d26a51baa0..86507f13f038d 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2174,8 +2174,9 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' #' @rdname column_collection_functions #' @param schema a structType object to use as the schema to use when parsing the JSON string. +#' Since Spark 2.3, the DDL-formatted string is also supported for the schema. #' @param as.json.array indicating if input string is JSON array of objects or a single object. -#' @aliases from_json from_json,Column,structType-method +#' @aliases from_json from_json,Column,characterOrstructType-method #' @export #' @examples #' @@ -2188,10 +2189,15 @@ setMethod("date_format", signature(y = "Column", x = "character"), #' df2 <- sql("SELECT named_struct('name', 'Bob') as people") #' df2 <- mutate(df2, people_json = to_json(df2$people)) #' schema <- structType(structField("name", "string")) -#' head(select(df2, from_json(df2$people_json, schema)))} +#' head(select(df2, from_json(df2$people_json, schema))) +#' head(select(df2, from_json(df2$people_json, "name STRING")))} #' @note from_json since 2.2.0 -setMethod("from_json", signature(x = "Column", schema = "structType"), +setMethod("from_json", signature(x = "Column", schema = "characterOrstructType"), function(x, schema, as.json.array = FALSE, ...) { + if (is.character(schema)) { + schema <- structType(schema) + } + if (as.json.array) { jschema <- callJStatic("org.apache.spark.sql.types.DataTypes", "createArrayType", diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 17f5283abead1..0a7be0e993975 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -233,6 +233,9 @@ setMethod("gapplyCollect", }) gapplyInternal <- function(x, func, schema) { + if (is.character(schema)) { + schema <- structType(schema) + } packageNamesArr <- serialize(.sparkREnv[[".packages"]], connection = NULL) broadcastArr <- lapply(ls(.broadcastNames), diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index cb5bdb90175bf..d1ed6833d5d02 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -23,18 +23,24 @@ #' Create a structType object that contains the metadata for a SparkDataFrame. Intended for #' use with createDataFrame and toDF. #' -#' @param x a structField object (created with the field() function) +#' @param x a structField object (created with the \code{structField} method). 
Since Spark 2.3, +#' this can be a DDL-formatted string, which is a comma separated list of field +#' definitions, e.g., "a INT, b STRING". #' @param ... additional structField objects #' @return a structType object #' @rdname structType #' @export #' @examples #'\dontrun{ -#' schema <- structType(structField("a", "integer"), structField("c", "string"), +#' schema <- structType(structField("a", "integer"), structField("c", "string"), #' structField("avg", "double")) #' df1 <- gapply(df, list("a", "c"), #' function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) }, #' schema) +#' schema <- structType("a INT, c STRING, avg DOUBLE") +#' df1 <- gapply(df, list("a", "c"), +#' function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) }, +#' schema) #' } #' @note structType since 1.4.0 structType <- function(x, ...) { @@ -68,6 +74,23 @@ structType.structField <- function(x, ...) { structType(stObj) } +#' @rdname structType +#' @method structType character +#' @export +structType.character <- function(x, ...) { + if (!is.character(x)) { + stop("schema must be a DDL-formatted string.") + } + if (length(list(...)) > 0) { + stop("multiple DDL-formatted strings are not supported") + } + + stObj <- handledCallJStatic("org.apache.spark.sql.types.StructType", + "fromDDL", + x) + structType(stObj) +} + #' Print a Spark StructType. #' #' This function prints the contents of a StructType returned from the @@ -102,7 +125,7 @@ print.structType <- function(x, ...) { #' field1 <- structField("a", "integer") #' field2 <- structField("c", "string") #' field3 <- structField("avg", "double") -#' schema <- structType(field1, field2, field3) +#' schema <- structType(field1, field2, field3) #' df1 <- gapply(df, list("a", "c"), #' function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) }, #' schema) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index a2bcb5aefe16d..77052d4a28345 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -146,6 +146,13 @@ test_that("structType and structField", { expect_is(testSchema, "structType") expect_is(testSchema$fields()[[2]], "structField") expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType") + + testSchema <- structType("a STRING, b INT") + expect_is(testSchema, "structType") + expect_is(testSchema$fields()[[2]], "structField") + expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType") + + expect_error(structType("A stri"), "DataType stri is not supported.") }) test_that("structField type strings", { @@ -1480,13 +1487,15 @@ test_that("column functions", { j <- collect(select(df, alias(to_json(df$info), "json"))) expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}") df <- as.DataFrame(j) - schema <- structType(structField("age", "integer"), - structField("height", "double")) - s <- collect(select(df, alias(from_json(df$json, schema), "structcol"))) - expect_equal(ncol(s), 1) - expect_equal(nrow(s), 3) - expect_is(s[[1]][[1]], "struct") - expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } ))) + schemas <- list(structType(structField("age", "integer"), structField("height", "double")), + "age INT, height DOUBLE") + for (schema in schemas) { + s <- collect(select(df, alias(from_json(df$json, schema), "structcol"))) + expect_equal(ncol(s), 1) + expect_equal(nrow(s), 3) + expect_is(s[[1]][[1]], "struct") + expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } ))) + } # 
passing option df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}"))) @@ -1504,14 +1513,15 @@ test_that("column functions", { # check if array type in string is correctly supported. jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]" df <- as.DataFrame(list(list("people" = jsonArr))) - schema <- structType(structField("name", "string")) - arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol"))) - expect_equal(ncol(arr), 1) - expect_equal(nrow(arr), 1) - expect_is(arr[[1]][[1]], "list") - expect_equal(length(arr$arrcol[[1]]), 2) - expect_equal(arr$arrcol[[1]][[1]]$name, "Bob") - expect_equal(arr$arrcol[[1]][[2]]$name, "Alice") + for (schema in list(structType(structField("name", "string")), "name STRING")) { + arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol"))) + expect_equal(ncol(arr), 1) + expect_equal(nrow(arr), 1) + expect_is(arr[[1]][[1]], "list") + expect_equal(length(arr$arrcol[[1]]), 2) + expect_equal(arr$arrcol[[1]][[1]]$name, "Bob") + expect_equal(arr$arrcol[[1]][[2]]$name, "Alice") + } # Test create_array() and create_map() df <- as.DataFrame(data.frame( @@ -2885,30 +2895,33 @@ test_that("dapply() and dapplyCollect() on a DataFrame", { expect_identical(ldf, result) # Filter and add a column - schema <- structType(structField("a", "integer"), structField("b", "double"), - structField("c", "string"), structField("d", "integer")) - df1 <- dapply( - df, - function(x) { - y <- x[x$a > 1, ] - y <- cbind(y, y$a + 1L) - }, - schema) - result <- collect(df1) - expected <- ldf[ldf$a > 1, ] - expected$d <- expected$a + 1L - rownames(expected) <- NULL - expect_identical(expected, result) - - result <- dapplyCollect( - df, - function(x) { - y <- x[x$a > 1, ] - y <- cbind(y, y$a + 1L) - }) - expected1 <- expected - names(expected1) <- names(result) - expect_identical(expected1, result) + schemas <- list(structType(structField("a", "integer"), structField("b", "double"), + structField("c", "string"), structField("d", "integer")), + "a INT, b DOUBLE, c STRING, d INT") + for (schema in schemas) { + df1 <- dapply( + df, + function(x) { + y <- x[x$a > 1, ] + y <- cbind(y, y$a + 1L) + }, + schema) + result <- collect(df1) + expected <- ldf[ldf$a > 1, ] + expected$d <- expected$a + 1L + rownames(expected) <- NULL + expect_identical(expected, result) + + result <- dapplyCollect( + df, + function(x) { + y <- x[x$a > 1, ] + y <- cbind(y, y$a + 1L) + }) + expected1 <- expected + names(expected1) <- names(result) + expect_identical(expected1, result) + } # Remove the added column df2 <- dapply( @@ -3020,29 +3033,32 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { # Computes the sum of second column by grouping on the first and third columns # and checks if the sum is larger than 2 - schema <- structType(structField("a", "integer"), structField("e", "boolean")) - df2 <- gapply( - df, - c(df$"a", df$"c"), - function(key, x) { - y <- data.frame(key[1], sum(x$b) > 2) - }, - schema) - actual <- collect(df2)$e - expected <- c(TRUE, TRUE) - expect_identical(actual, expected) - - df2Collect <- gapplyCollect( - df, - c(df$"a", df$"c"), - function(key, x) { - y <- data.frame(key[1], sum(x$b) > 2) - colnames(y) <- c("a", "e") - y - }) - actual <- df2Collect$e + schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")), + "a INT, e BOOLEAN") + for (schema in schemas) { + df2 <- gapply( + df, + c(df$"a", df$"c"), + function(key, x) { + y <- data.frame(key[1], sum(x$b) > 2) + }, + 
schema) + actual <- collect(df2)$e + expected <- c(TRUE, TRUE) expect_identical(actual, expected) + df2Collect <- gapplyCollect( + df, + c(df$"a", df$"c"), + function(key, x) { + y <- data.frame(key[1], sum(x$b) > 2) + colnames(y) <- c("a", "e") + y + }) + actual <- df2Collect$e + expect_identical(actual, expected) + } + # Computes the arithmetic mean of the second column by grouping # on the first and third columns. Output the groupping value and the average. schema <- structType(structField("a", "integer"), structField("c", "string"), diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 5d8ded83f667d..f3e7d033e97cf 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1883,15 +1883,20 @@ def from_json(col, schema, options={}): string. :param col: string column in json format - :param schema: a StructType or ArrayType of StructType to use when parsing the json column + :param schema: a StructType or ArrayType of StructType to use when parsing the json column. :param options: options to control parsing. accepts the same options as the json datasource + .. note:: Since Spark 2.3, the DDL-formatted string or a JSON format string is also + supported for ``schema``. + >>> from pyspark.sql.types import * >>> data = [(1, '''{"a": 1}''')] >>> schema = StructType([StructField("a", IntegerType())]) >>> df = spark.createDataFrame(data, ("key", "value")) >>> df.select(from_json(df.value, schema).alias("json")).collect() [Row(json=Row(a=1))] + >>> df.select(from_json(df.value, "a INT").alias("json")).collect() + [Row(json=Row(a=1))] >>> data = [(1, '''[{"a": 1}]''')] >>> schema = ArrayType(StructType([StructField("a", IntegerType())])) >>> df = spark.createDataFrame(data, ("key", "value")) @@ -1900,7 +1905,9 @@ def from_json(col, schema, options={}): """ sc = SparkContext._active_spark_context - jc = sc._jvm.functions.from_json(_to_java_column(col), schema.json(), options) + if isinstance(schema, DataType): + schema = schema.json() + jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options) return Column(jc) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 0c7b483f5c836..ebdeb42b0bfb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2114,7 +2114,7 @@ object functions { * Calculates the hash code of given columns, and returns the result as an int column. * * @group misc_funcs - * @since 2.0 + * @since 2.0.0 */ @scala.annotation.varargs def hash(cols: Column*): Column = withExpr { @@ -3074,9 +3074,8 @@ object functions { * string. * * @param e a string column containing JSON data. - * @param schema the schema to use when parsing the json string as a json string. In Spark 2.1, - * the user-provided schema has to be in JSON format. Since Spark 2.2, the DDL - * format is also supported for the schema. + * @param schema the schema to use when parsing the json string as a json string, it could be a + * JSON format string or a DDL-formatted string. * * @group collection_funcs * @since 2.3.0 From d03aebbe6508ba441dc87f9546f27aeb27553d77 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 10 Jul 2017 15:21:03 -0700 Subject: [PATCH 147/779] [SPARK-13534][PYSPARK] Using Apache Arrow to increase performance of DataFrame.toPandas ## What changes were proposed in this pull request? 
Integrate Apache Arrow with Spark to increase performance of `DataFrame.toPandas`. This has been done by using Arrow to convert data partitions on the executor JVM to Arrow payload byte arrays, which are then served to the Python process. The Python DataFrame can then collect the Arrow payloads, where they are combined and converted to a Pandas DataFrame. All data types except complex, date, timestamp, and decimal are currently supported; otherwise an `UnsupportedOperation` exception is thrown. Additions to Spark include a Scala package-private method `Dataset.toArrowPayload` that converts data partitions in the executor JVM to `ArrowPayload`s as byte arrays so they can be easily served, and a package-private class/object `ArrowConverters` that provides data type mappings and conversion routines. In Python, a private method `DataFrame._collectAsArrow` is added to collect Arrow payloads, and the SQLConf "spark.sql.execution.arrow.enable" can be used to make `toPandas()` use Arrow (the old conversion remains the default). ## How was this patch tested? Added a new test suite `ArrowConvertersSuite` that runs tests on conversion of Datasets to Arrow payloads for supported types. The suite generates a Dataset and matching Arrow JSON data, converts the Dataset to an Arrow payload, and validates it against the JSON data, ensuring that both the schema and the data have been converted correctly. Added PySpark tests to verify that the `toPandas` method produces equal DataFrames with and without pyarrow, and a round-trip test to ensure the pandas DataFrame produced by PySpark is equal to one made directly with pandas. Author: Bryan Cutler Author: Li Jin Author: Li Jin Author: Wes McKinney Closes #18459 from BryanCutler/toPandas_with_arrow-SPARK-13534.
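A hedged usage sketch of the two new configuration keys (set here from a Scala session; the feature itself is exercised from PySpark's `toPandas()`):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("arrow-toggle").getOrCreate()

// New keys added by this patch (both internal); the comments show the defaults.
spark.conf.set("spark.sql.execution.arrow.enable", "true")              // default: false
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "10000") // default: 10000

// From PySpark, with pyarrow installed on the driver:
//   pdf = spark.range(10).toPandas()   # now goes through Arrow payloads
```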
--- bin/pyspark | 2 +- dev/deps/spark-deps-hadoop-2.6 | 5 + dev/deps/spark-deps-hadoop-2.7 | 5 + pom.xml | 20 + python/pyspark/serializers.py | 17 + python/pyspark/sql/dataframe.py | 48 +- python/pyspark/sql/tests.py | 78 +- .../apache/spark/sql/internal/SQLConf.scala | 22 + sql/core/pom.xml | 4 + .../scala/org/apache/spark/sql/Dataset.scala | 20 + .../sql/execution/arrow/ArrowConverters.scala | 429 ++++++ .../arrow/ArrowConvertersSuite.scala | 1222 +++++++++++++++++ 12 files changed, 1859 insertions(+), 13 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala diff --git a/bin/pyspark b/bin/pyspark index d3b512eeb1209..dd286277c1fc1 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -68,7 +68,7 @@ if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m "$1" + exec "$PYSPARK_DRIVER_PYTHON" -m "$@" exit fi diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index c1325318d52fa..1a6515be51cff 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -13,6 +13,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar +arrow-format-0.4.0.jar +arrow-memory-0.4.0.jar +arrow-vector-0.4.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -55,6 +58,7 @@ datanucleus-core-3.2.10.jar datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar +flatbuffers-1.2.0-3f79e055.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -77,6 +81,7 @@ hadoop-yarn-server-web-proxy-2.6.5.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar +hppc-0.7.1.jar htrace-core-3.0.4.jar httpclient-4.5.2.jar httpcore-4.4.4.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index ac5abd21807b6..09e5a4288ca50 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -13,6 +13,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar +arrow-format-0.4.0.jar +arrow-memory-0.4.0.jar +arrow-vector-0.4.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -55,6 +58,7 @@ datanucleus-core-3.2.10.jar datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar +flatbuffers-1.2.0-3f79e055.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -77,6 +81,7 @@ hadoop-yarn-server-web-proxy-2.7.3.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar +hppc-0.7.1.jar htrace-core-3.1.0-incubating.jar httpclient-4.5.2.jar httpcore-4.4.4.jar diff --git a/pom.xml b/pom.xml index 5f524079495c0..f124ba45007b7 100644 --- a/pom.xml +++ b/pom.xml @@ -181,6 +181,7 @@ 2.6 1.8 1.0.0 + 0.4.0 ${java.home} @@ -1878,6 +1879,25 @@ paranamer ${paranamer.version} + + org.apache.arrow + arrow-vector + ${arrow.version} + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + io.netty + netty-handler + + + diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index ea5e00e9eeef5..d5c2a7518b18f 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -182,6 +182,23 @@ def loads(self, obj): raise NotImplementedError +class ArrowSerializer(FramedSerializer): + """ + Serializes an Arrow 
stream. + """ + + def dumps(self, obj): + raise NotImplementedError + + def loads(self, obj): + import pyarrow as pa + reader = pa.RecordBatchFileReader(pa.BufferReader(obj)) + return reader.read_all() + + def __repr__(self): + return "ArrowSerializer" + + class BatchedSerializer(Serializer): """ diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 27a6dad8917d3..944739bcd2078 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -29,7 +29,8 @@ from pyspark import copy_func, since from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix -from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer +from pyspark.serializers import ArrowSerializer, BatchedSerializer, PickleSerializer, \ + UTF8Deserializer from pyspark.storagelevel import StorageLevel from pyspark.traceback_utils import SCCallSiteSync from pyspark.sql.types import _parse_datatype_json_string @@ -1710,7 +1711,8 @@ def toDF(self, *cols): @since(1.3) def toPandas(self): - """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. + """ + Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. This is only available if Pandas is installed and available. @@ -1723,18 +1725,42 @@ def toPandas(self): 1 5 Bob """ import pandas as pd + if self.sql_ctx.getConf("spark.sql.execution.arrow.enable", "false").lower() == "true": + try: + import pyarrow + tables = self._collectAsArrow() + if tables: + table = pyarrow.concat_tables(tables) + return table.to_pandas() + else: + return pd.DataFrame.from_records([], columns=self.columns) + except ImportError as e: + msg = "note: pyarrow must be installed and available on calling Python process " \ + "if using spark.sql.execution.arrow.enable=true" + raise ImportError("%s\n%s" % (e.message, msg)) + else: + dtype = {} + for field in self.schema: + pandas_type = _to_corrected_pandas_type(field.dataType) + if pandas_type is not None: + dtype[field.name] = pandas_type - dtype = {} - for field in self.schema: - pandas_type = _to_corrected_pandas_type(field.dataType) - if pandas_type is not None: - dtype[field.name] = pandas_type + pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) - pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) + for f, t in dtype.items(): + pdf[f] = pdf[f].astype(t, copy=False) + return pdf - for f, t in dtype.items(): - pdf[f] = pdf[f].astype(t, copy=False) - return pdf + def _collectAsArrow(self): + """ + Returns all records as list of deserialized ArrowPayloads, pyarrow must be installed + and available. + + .. note:: Experimental. 
+ """ + with SCCallSiteSync(self._sc) as css: + port = self._jdf.collectAsArrowToPython() + return list(_load_from_socket(port, ArrowSerializer())) ########################################################################################## # Pandas compatibility diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 9db2f40474f70..bd8477e35f37a 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -58,12 +58,21 @@ from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row from pyspark.sql.types import * from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier -from pyspark.tests import ReusedPySparkTestCase, SparkSubmitTests +from pyspark.tests import QuietTest, ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException +_have_arrow = False +try: + import pyarrow + _have_arrow = True +except: + # No Arrow, but that's okay, we'll skip those tests + pass + + class UTCOffsetTimezone(datetime.tzinfo): """ Specifies timezone in UTC offset @@ -2843,6 +2852,73 @@ def __init__(self, **kwargs): _make_type_verifier(data_type, nullable=False)(obj) +@unittest.skipIf(not _have_arrow, "Arrow not installed") +class ArrowTests(ReusedPySparkTestCase): + + @classmethod + def setUpClass(cls): + ReusedPySparkTestCase.setUpClass() + cls.spark = SparkSession(cls.sc) + cls.spark.conf.set("spark.sql.execution.arrow.enable", "true") + cls.schema = StructType([ + StructField("1_str_t", StringType(), True), + StructField("2_int_t", IntegerType(), True), + StructField("3_long_t", LongType(), True), + StructField("4_float_t", FloatType(), True), + StructField("5_double_t", DoubleType(), True)]) + cls.data = [("a", 1, 10, 0.2, 2.0), + ("b", 2, 20, 0.4, 4.0), + ("c", 3, 30, 0.8, 6.0)] + + def assertFramesEqual(self, df_with_arrow, df_without): + msg = ("DataFrame from Arrow is not equal" + + ("\n\nWith Arrow:\n%s\n%s" % (df_with_arrow, df_with_arrow.dtypes)) + + ("\n\nWithout:\n%s\n%s" % (df_without, df_without.dtypes))) + self.assertTrue(df_without.equals(df_with_arrow), msg=msg) + + def test_unsupported_datatype(self): + schema = StructType([StructField("array", ArrayType(IntegerType(), False), True)]) + df = self.spark.createDataFrame([([1, 2, 3],)], schema=schema) + with QuietTest(self.sc): + self.assertRaises(Exception, lambda: df.toPandas()) + + def test_null_conversion(self): + df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] + + self.data) + pdf = df_null.toPandas() + null_counts = pdf.isnull().sum().tolist() + self.assertTrue(all([c == 1 for c in null_counts])) + + def test_toPandas_arrow_toggle(self): + df = self.spark.createDataFrame(self.data, schema=self.schema) + self.spark.conf.set("spark.sql.execution.arrow.enable", "false") + pdf = df.toPandas() + self.spark.conf.set("spark.sql.execution.arrow.enable", "true") + pdf_arrow = df.toPandas() + self.assertFramesEqual(pdf_arrow, pdf) + + def test_pandas_round_trip(self): + import pandas as pd + import numpy as np + data_dict = {} + for j, name in enumerate(self.schema.names): + data_dict[name] = [self.data[i][j] for i in range(len(self.data))] + # need to convert these to numpy types first + data_dict["2_int_t"] = np.int32(data_dict["2_int_t"]) + data_dict["4_float_t"] = np.float32(data_dict["4_float_t"]) + pdf = pd.DataFrame(data=data_dict) + df = 
self.spark.createDataFrame(self.data, schema=self.schema) + pdf_arrow = df.toPandas() + self.assertFramesEqual(pdf_arrow, pdf) + + def test_filtered_frame(self): + df = self.spark.range(3).toDF("i") + pdf = df.filter("i < 0").toPandas() + self.assertEqual(len(pdf.columns), 1) + self.assertEqual(pdf.columns[0], "i") + self.assertTrue(pdf.empty) + + if __name__ == "__main__": from pyspark.sql.tests import * if xmlrunner: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 25152f3e32d6b..643587a6eb09d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -855,6 +855,24 @@ object SQLConf { .intConf .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt) + val ARROW_EXECUTION_ENABLE = + buildConf("spark.sql.execution.arrow.enable") + .internal() + .doc("Make use of Apache Arrow for columnar data transfers. Currently available " + + "for use with pyspark.sql.DataFrame.toPandas with the following data types: " + + "StringType, BinaryType, BooleanType, DoubleType, FloatType, ByteType, IntegerType, " + + "LongType, ShortType") + .booleanConf + .createWithDefault(false) + + val ARROW_EXECUTION_MAX_RECORDS_PER_BATCH = + buildConf("spark.sql.execution.arrow.maxRecordsPerBatch") + .internal() + .doc("When using Apache Arrow, limit the maximum number of records that can be written " + + "to a single ArrowRecordBatch in memory. If set to zero or negative there is no limit.") + .intConf + .createWithDefault(10000) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1115,6 +1133,10 @@ class SQLConf extends Serializable with Logging { def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) + def arrowEnable: Boolean = getConf(ARROW_EXECUTION_ENABLE) + + def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 1bc34a6b069d9..661c31ded7148 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -103,6 +103,10 @@ jackson-databind ${fasterxml.jackson.version} + + org.apache.arrow + arrow-vector + org.apache.xbean xbean-asm5-shaded diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index dfb51192c69bc..a7773831df075 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -46,6 +46,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.arrow.{ArrowConverters, ArrowPayload} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.python.EvaluatePython @@ -2907,6 +2908,16 @@ class Dataset[T] private[sql]( } } + /** + * Collect a Dataset as ArrowPayload byte arrays and serve to PySpark. 
+ */ + private[sql] def collectAsArrowToPython(): Int = { + withNewExecutionId { + val iter = toArrowPayload.collect().iterator.map(_.asPythonSerializable) + PythonRDD.serveIterator(iter, "serve-Arrow") + } + } + private[sql] def toPythonIterator(): Int = { withNewExecutionId { PythonRDD.toLocalIteratorAndServe(javaToPython.rdd) @@ -2988,4 +2999,13 @@ class Dataset[T] private[sql]( Dataset(sparkSession, logicalPlan) } } + + /** Convert to an RDD of ArrowPayload byte arrays */ + private[sql] def toArrowPayload: RDD[ArrowPayload] = { + val schemaCaptured = this.schema + val maxRecordsPerBatch = sparkSession.sessionState.conf.arrowMaxRecordsPerBatch + queryExecution.toRdd.mapPartitionsInternal { iter => + ArrowConverters.toPayloadIterator(iter, schemaCaptured, maxRecordsPerBatch) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala new file mode 100644 index 0000000000000..6af5c73422377 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -0,0 +1,429 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.execution.arrow + +import java.io.ByteArrayOutputStream +import java.nio.channels.Channels + +import scala.collection.JavaConverters._ + +import io.netty.buffer.ArrowBuf +import org.apache.arrow.memory.{BufferAllocator, RootAllocator} +import org.apache.arrow.vector._ +import org.apache.arrow.vector.BaseValueVector.BaseMutator +import org.apache.arrow.vector.file._ +import org.apache.arrow.vector.schema.{ArrowFieldNode, ArrowRecordBatch} +import org.apache.arrow.vector.types.FloatingPointPrecision +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + + +/** + * Store Arrow data in a form that can be serialized by Spark and served to a Python process. + */ +private[sql] class ArrowPayload private[arrow] (payload: Array[Byte]) extends Serializable { + + /** + * Convert the ArrowPayload to an ArrowRecordBatch. + */ + def loadBatch(allocator: BufferAllocator): ArrowRecordBatch = { + ArrowConverters.byteArrayToBatch(payload, allocator) + } + + /** + * Get the ArrowPayload as a type that can be served to Python. + */ + def asPythonSerializable: Array[Byte] = payload +} + +private[sql] object ArrowPayload { + + /** + * Create an ArrowPayload from an ArrowRecordBatch and Spark schema. 
+ */ + def apply( + batch: ArrowRecordBatch, + schema: StructType, + allocator: BufferAllocator): ArrowPayload = { + new ArrowPayload(ArrowConverters.batchToByteArray(batch, schema, allocator)) + } +} + +private[sql] object ArrowConverters { + + /** + * Map a Spark DataType to ArrowType. + */ + private[arrow] def sparkTypeToArrowType(dataType: DataType): ArrowType = { + dataType match { + case BooleanType => ArrowType.Bool.INSTANCE + case ShortType => new ArrowType.Int(8 * ShortType.defaultSize, true) + case IntegerType => new ArrowType.Int(8 * IntegerType.defaultSize, true) + case LongType => new ArrowType.Int(8 * LongType.defaultSize, true) + case FloatType => new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) + case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE) + case ByteType => new ArrowType.Int(8, true) + case StringType => ArrowType.Utf8.INSTANCE + case BinaryType => ArrowType.Binary.INSTANCE + case _ => throw new UnsupportedOperationException(s"Unsupported data type: $dataType") + } + } + + /** + * Convert a Spark Dataset schema to Arrow schema. + */ + private[arrow] def schemaToArrowSchema(schema: StructType): Schema = { + val arrowFields = schema.fields.map { f => + new Field(f.name, f.nullable, sparkTypeToArrowType(f.dataType), List.empty[Field].asJava) + } + new Schema(arrowFields.toList.asJava) + } + + /** + * Maps Iterator from InternalRow to ArrowPayload. Limit ArrowRecordBatch size in ArrowPayload + * by setting maxRecordsPerBatch or use 0 to fully consume rowIter. + */ + private[sql] def toPayloadIterator( + rowIter: Iterator[InternalRow], + schema: StructType, + maxRecordsPerBatch: Int): Iterator[ArrowPayload] = { + new Iterator[ArrowPayload] { + private val _allocator = new RootAllocator(Long.MaxValue) + private var _nextPayload = if (rowIter.nonEmpty) convert() else null + + override def hasNext: Boolean = _nextPayload != null + + override def next(): ArrowPayload = { + val obj = _nextPayload + if (hasNext) { + if (rowIter.hasNext) { + _nextPayload = convert() + } else { + _allocator.close() + _nextPayload = null + } + } + obj + } + + private def convert(): ArrowPayload = { + val batch = internalRowIterToArrowBatch(rowIter, schema, _allocator, maxRecordsPerBatch) + ArrowPayload(batch, schema, _allocator) + } + } + } + + /** + * Iterate over InternalRows and write to an ArrowRecordBatch, stopping when rowIter is consumed + * or the number of records in the batch equals maxRecordsInBatch. If maxRecordsPerBatch is 0, + * then rowIter will be fully consumed. 
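+   *
+   * For example, with maxRecordsPerBatch = 3 each call drains at most 3 rows from rowIter,
+   * so toPayloadIterator would turn a 10-row partition into batches of 3, 3, 3 and 1
+   * records; a zero or negative value (mirroring spark.sql.execution.arrow.maxRecordsPerBatch)
+   * disables the limit and the whole partition becomes a single batch.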
+ */ + private def internalRowIterToArrowBatch( + rowIter: Iterator[InternalRow], + schema: StructType, + allocator: BufferAllocator, + maxRecordsPerBatch: Int = 0): ArrowRecordBatch = { + + val columnWriters = schema.fields.zipWithIndex.map { case (field, ordinal) => + ColumnWriter(field.dataType, ordinal, allocator).init() + } + + val writerLength = columnWriters.length + var recordsInBatch = 0 + while (rowIter.hasNext && (maxRecordsPerBatch <= 0 || recordsInBatch < maxRecordsPerBatch)) { + val row = rowIter.next() + var i = 0 + while (i < writerLength) { + columnWriters(i).write(row) + i += 1 + } + recordsInBatch += 1 + } + + val (fieldNodes, bufferArrays) = columnWriters.map(_.finish()).unzip + val buffers = bufferArrays.flatten + + val rowLength = if (fieldNodes.nonEmpty) fieldNodes.head.getLength else 0 + val recordBatch = new ArrowRecordBatch(rowLength, + fieldNodes.toList.asJava, buffers.toList.asJava) + + buffers.foreach(_.release()) + recordBatch + } + + /** + * Convert an ArrowRecordBatch to a byte array and close batch to release resources. Once closed, + * the batch can no longer be used. + */ + private[arrow] def batchToByteArray( + batch: ArrowRecordBatch, + schema: StructType, + allocator: BufferAllocator): Array[Byte] = { + val arrowSchema = ArrowConverters.schemaToArrowSchema(schema) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + val out = new ByteArrayOutputStream() + val writer = new ArrowFileWriter(root, null, Channels.newChannel(out)) + + // Write a batch to byte stream, ensure the batch, allocator and writer are closed + Utils.tryWithSafeFinally { + val loader = new VectorLoader(root) + loader.load(batch) + writer.writeBatch() // writeBatch can throw IOException + } { + batch.close() + root.close() + writer.close() + } + out.toByteArray + } + + /** + * Convert a byte array to an ArrowRecordBatch. + */ + private[arrow] def byteArrayToBatch( + batchBytes: Array[Byte], + allocator: BufferAllocator): ArrowRecordBatch = { + val in = new ByteArrayReadableSeekableByteChannel(batchBytes) + val reader = new ArrowFileReader(in, allocator) + + // Read a batch from a byte stream, ensure the reader is closed + Utils.tryWithSafeFinally { + val root = reader.getVectorSchemaRoot // throws IOException + val unloader = new VectorUnloader(root) + reader.loadNextBatch() // throws IOException + unloader.getRecordBatch + } { + reader.close() + } + } +} + +/** + * Interface for writing InternalRows to Arrow Buffers. + */ +private[arrow] trait ColumnWriter { + def init(): this.type + def write(row: InternalRow): Unit + + /** + * Clear the column writer and return the ArrowFieldNode and ArrowBuf. + * This should be called only once after all the data is written. + */ + def finish(): (ArrowFieldNode, Array[ArrowBuf]) +} + +/** + * Base class for flat arrow column writer, i.e., column without children. 
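+ *
+ * The expected lifecycle, as driven by internalRowIterToArrowBatch, is a single init(),
+ * one write(row) per input row, then a single finish() to obtain the ArrowFieldNode and
+ * the value buffers. A minimal sketch, assuming an allocator and some InternalRows:
+ * {{{
+ *   val writer = ColumnWriter(IntegerType, ordinal = 0, allocator).init()
+ *   rows.foreach(writer.write)
+ *   val (fieldNode, buffers) = writer.finish()
+ * }}}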
+ */ +private[arrow] abstract class PrimitiveColumnWriter(val ordinal: Int) + extends ColumnWriter { + + def getFieldType(dtype: ArrowType): FieldType = FieldType.nullable(dtype) + + def valueVector: BaseDataValueVector + def valueMutator: BaseMutator + + def setNull(): Unit + def setValue(row: InternalRow): Unit + + protected var count = 0 + protected var nullCount = 0 + + override def init(): this.type = { + valueVector.allocateNew() + this + } + + override def write(row: InternalRow): Unit = { + if (row.isNullAt(ordinal)) { + setNull() + nullCount += 1 + } else { + setValue(row) + } + count += 1 + } + + override def finish(): (ArrowFieldNode, Array[ArrowBuf]) = { + valueMutator.setValueCount(count) + val fieldNode = new ArrowFieldNode(count, nullCount) + val valueBuffers = valueVector.getBuffers(true) + (fieldNode, valueBuffers) + } +} + +private[arrow] class BooleanColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableBitVector + = new NullableBitVector("BooleanValue", getFieldType(dtype), allocator) + override val valueMutator: NullableBitVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, if (row.getBoolean(ordinal)) 1 else 0 ) +} + +private[arrow] class ShortColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableSmallIntVector + = new NullableSmallIntVector("ShortValue", getFieldType(dtype: ArrowType), allocator) + override val valueMutator: NullableSmallIntVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getShort(ordinal)) +} + +private[arrow] class IntegerColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableIntVector + = new NullableIntVector("IntValue", getFieldType(dtype), allocator) + override val valueMutator: NullableIntVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getInt(ordinal)) +} + +private[arrow] class LongColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableBigIntVector + = new NullableBigIntVector("LongValue", getFieldType(dtype), allocator) + override val valueMutator: NullableBigIntVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getLong(ordinal)) +} + +private[arrow] class FloatColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableFloat4Vector + = new NullableFloat4Vector("FloatValue", getFieldType(dtype), allocator) + override val valueMutator: NullableFloat4Vector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getFloat(ordinal)) +} + +private[arrow] class DoubleColumnWriter(dtype: ArrowType, ordinal: Int, 
allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableFloat8Vector + = new NullableFloat8Vector("DoubleValue", getFieldType(dtype), allocator) + override val valueMutator: NullableFloat8Vector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getDouble(ordinal)) +} + +private[arrow] class ByteColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableUInt1Vector + = new NullableUInt1Vector("ByteValue", getFieldType(dtype), allocator) + override val valueMutator: NullableUInt1Vector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit + = valueMutator.setSafe(count, row.getByte(ordinal)) +} + +private[arrow] class UTF8StringColumnWriter( + dtype: ArrowType, + ordinal: Int, + allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableVarCharVector + = new NullableVarCharVector("UTF8StringValue", getFieldType(dtype), allocator) + override val valueMutator: NullableVarCharVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit = { + val str = row.getUTF8String(ordinal) + valueMutator.setSafe(count, str.getByteBuffer, 0, str.numBytes) + } +} + +private[arrow] class BinaryColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableVarBinaryVector + = new NullableVarBinaryVector("BinaryValue", getFieldType(dtype), allocator) + override val valueMutator: NullableVarBinaryVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit = { + val bytes = row.getBinary(ordinal) + valueMutator.setSafe(count, bytes, 0, bytes.length) + } +} + +private[arrow] class DateColumnWriter(dtype: ArrowType, ordinal: Int, allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableDateDayVector + = new NullableDateDayVector("DateValue", getFieldType(dtype), allocator) + override val valueMutator: NullableDateDayVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit = { + valueMutator.setSafe(count, row.getInt(ordinal)) + } +} + +private[arrow] class TimeStampColumnWriter( + dtype: ArrowType, + ordinal: Int, + allocator: BufferAllocator) + extends PrimitiveColumnWriter(ordinal) { + override val valueVector: NullableTimeStampMicroVector + = new NullableTimeStampMicroVector("TimeStampValue", getFieldType(dtype), allocator) + override val valueMutator: NullableTimeStampMicroVector#Mutator = valueVector.getMutator + + override def setNull(): Unit = valueMutator.setNull(count) + override def setValue(row: InternalRow): Unit = { + valueMutator.setSafe(count, row.getLong(ordinal)) + } +} + +private[arrow] object ColumnWriter { + + /** + * Create an Arrow ColumnWriter given the type and ordinal of row. 
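+   *
+   * Note: DateColumnWriter and TimeStampColumnWriter are matched below, but
+   * sparkTypeToArrowType has no mapping for DateType or TimestampType, so the call on the
+   * first line throws UnsupportedOperationException before those cases can be reached;
+   * the "unsupported types" test in ArrowConvertersSuite relies on dates and timestamps
+   * being rejected for now.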
+ */ + def apply(dataType: DataType, ordinal: Int, allocator: BufferAllocator): ColumnWriter = { + val dtype = ArrowConverters.sparkTypeToArrowType(dataType) + dataType match { + case BooleanType => new BooleanColumnWriter(dtype, ordinal, allocator) + case ShortType => new ShortColumnWriter(dtype, ordinal, allocator) + case IntegerType => new IntegerColumnWriter(dtype, ordinal, allocator) + case LongType => new LongColumnWriter(dtype, ordinal, allocator) + case FloatType => new FloatColumnWriter(dtype, ordinal, allocator) + case DoubleType => new DoubleColumnWriter(dtype, ordinal, allocator) + case ByteType => new ByteColumnWriter(dtype, ordinal, allocator) + case StringType => new UTF8StringColumnWriter(dtype, ordinal, allocator) + case BinaryType => new BinaryColumnWriter(dtype, ordinal, allocator) + case DateType => new DateColumnWriter(dtype, ordinal, allocator) + case TimestampType => new TimeStampColumnWriter(dtype, ordinal, allocator) + case _ => throw new UnsupportedOperationException(s"Unsupported data type: $dataType") + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala new file mode 100644 index 0000000000000..159328cc0d958 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -0,0 +1,1222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.arrow + +import java.io.File +import java.nio.charset.StandardCharsets +import java.sql.{Date, Timestamp} +import java.text.SimpleDateFormat +import java.util.Locale + +import com.google.common.io.Files +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.{VectorLoader, VectorSchemaRoot} +import org.apache.arrow.vector.file.json.JsonFileReader +import org.apache.arrow.vector.util.Validator +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.SparkException +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{BinaryType, StructField, StructType} +import org.apache.spark.util.Utils + + +class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { + import testImplicits._ + + private var tempDataPath: String = _ + + override def beforeAll(): Unit = { + super.beforeAll() + tempDataPath = Utils.createTempDir(namePrefix = "arrow").getAbsolutePath + } + + test("collect to arrow record batch") { + val indexData = (1 to 6).toDF("i") + val arrowPayloads = indexData.toArrowPayload.collect() + assert(arrowPayloads.nonEmpty) + assert(arrowPayloads.length == indexData.rdd.getNumPartitions) + val allocator = new RootAllocator(Long.MaxValue) + val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) + val rowCount = arrowRecordBatches.map(_.getLength).sum + assert(rowCount === indexData.count()) + arrowRecordBatches.foreach(batch => assert(batch.getNodes.size() > 0)) + arrowRecordBatches.foreach(_.close()) + allocator.close() + } + + test("short conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_s", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 16 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 16 + | } ] + | } + | }, { + | "name" : "b_s", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 16 + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 16 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_s", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 2, -2, 32767, -32768 ] + | }, { + | "name" : "b_s", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1, 0, 0, -2, 0, -32768 ] + | } ] + | } ] + |} + """.stripMargin + + val a_s = List[Short](1, -1, 2, -2, 32767, -32768) + val b_s = List[Option[Short]](Some(1), None, None, Some(-2), None, Some(-32768)) + val df = a_s.zip(b_s).toDF("a_s", "b_s") + + collectAndValidate(df, json, "integer-16bit.json") + } + + test("int conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "b_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : 
"VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_i", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] + | }, { + | "name" : "b_i", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] + | } ] + | } ] + |} + """.stripMargin + + val a_i = List[Int](1, -1, 2, -2, 2147483647, -2147483648) + val b_i = List[Option[Int]](Some(1), None, None, Some(-2), None, Some(-2147483648)) + val df = a_i.zip(b_i).toDF("a_i", "b_i") + + collectAndValidate(df, json, "integer-32bit.json") + } + + test("long conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_l", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 64 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | }, { + | "name" : "b_l", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 64 + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_l", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 2, -2, 9223372036854775807, -9223372036854775808 ] + | }, { + | "name" : "b_l", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1, 0, 0, -2, 0, -9223372036854775808 ] + | } ] + | } ] + |} + """.stripMargin + + val a_l = List[Long](1, -1, 2, -2, 9223372036854775807L, -9223372036854775808L) + val b_l = List[Option[Long]](Some(1), None, None, Some(-2), None, Some(-9223372036854775808L)) + val df = a_l.zip(b_l).toDF("a_l", "b_l") + + collectAndValidate(df, json, "integer-64bit.json") + } + + test("float conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_f", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "SINGLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "b_f", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "SINGLE" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_f", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0 ] + | }, { + | "name" : "b_f", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1.1, 0.0, 0.0, 2.2, 0.0, 3.3 ] + | } ] + | } ] + |} + """.stripMargin + + val a_f = List(1.0f, 2.0f, 0.01f, 200.0f, 0.0001f, 20000.0f) + val b_f = List[Option[Float]](Some(1.1f), None, None, Some(2.2f), None, Some(3.3f)) + val df = a_f.zip(b_f).toDF("a_f", "b_f") + + collectAndValidate(df, json, "floating_point-single_precision.json") + } + + test("double conversion") { + val json = + s""" + |{ + | "schema" : { + | 
"fields" : [ { + | "name" : "a_d", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "DOUBLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | }, { + | "name" : "b_d", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "DOUBLE" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_d", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1.0, 2.0, 0.01, 200.0, 1.0E-4, 20000.0 ] + | }, { + | "name" : "b_d", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1.1, 0.0, 0.0, 2.2, 0.0, 3.3 ] + | } ] + | } ] + |} + """.stripMargin + + val a_d = List(1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0) + val b_d = List[Option[Double]](Some(1.1), None, None, Some(2.2), None, Some(3.3)) + val df = a_d.zip(b_d).toDF("a_d", "b_d") + + collectAndValidate(df, json, "floating_point-double_precision.json") + } + + test("index conversion") { + val data = List[Int](1, 2, 3, 4, 5, 6) + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "i", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, 2, 3, 4, 5, 6 ] + | } ] + | } ] + |} + """.stripMargin + val df = data.toDF("i") + + collectAndValidate(df, json, "indexData-ints.json") + } + + test("mixed numeric type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 16 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 16 + | } ] + | } + | }, { + | "name" : "b", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "SINGLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "c", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "d", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "DOUBLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | }, { + | "name" : "e", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 64 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | 
"type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, 2, 3, 4, 5, 6 ] + | }, { + | "name" : "b", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 ] + | }, { + | "name" : "c", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, 2, 3, 4, 5, 6 ] + | }, { + | "name" : "d", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 ] + | }, { + | "name" : "e", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, 2, 3, 4, 5, 6 ] + | } ] + | } ] + |} + """.stripMargin + + val data = List(1, 2, 3, 4, 5, 6) + val data_tuples = for (d <- data) yield { + (d.toShort, d.toFloat, d.toInt, d.toDouble, d.toLong) + } + val df = data_tuples.toDF("a", "b", "c", "d", "e") + + collectAndValidate(df, json, "mixed_numeric_types.json") + } + + test("string type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "upper_case", + | "type" : { + | "name" : "utf8" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "OFFSET", + | "typeBitWidth" : 32 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 8 + | } ] + | } + | }, { + | "name" : "lower_case", + | "type" : { + | "name" : "utf8" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "OFFSET", + | "typeBitWidth" : 32 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 8 + | } ] + | } + | }, { + | "name" : "null_str", + | "type" : { + | "name" : "utf8" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "OFFSET", + | "typeBitWidth" : 32 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 8 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 3, + | "columns" : [ { + | "name" : "upper_case", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "OFFSET" : [ 0, 1, 2, 3 ], + | "DATA" : [ "A", "B", "C" ] + | }, { + | "name" : "lower_case", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "OFFSET" : [ 0, 1, 2, 3 ], + | "DATA" : [ "a", "b", "c" ] + | }, { + | "name" : "null_str", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 0 ], + | "OFFSET" : [ 0, 2, 5, 5 ], + | "DATA" : [ "ab", "CDE", "" ] + | } ] + | } ] + |} + """.stripMargin + + val upperCase = Seq("A", "B", "C") + val lowerCase = Seq("a", "b", "c") + val nullStr = Seq("ab", "CDE", null) + val df = (upperCase, lowerCase, nullStr).zipped.toList + .toDF("upper_case", "lower_case", "null_str") + + collectAndValidate(df, json, "stringData.json") + } + + test("boolean type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_bool", + | "type" : { + | "name" : "bool" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 1 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 4, + | "columns" : [ { + | "name" : "a_bool", + | "count" : 4, + | "VALIDITY" : [ 1, 1, 1, 1 ], + | "DATA" : [ true, true, false, true ] + | } ] + | } 
] + |} + """.stripMargin + val df = Seq(true, true, false, true).toDF("a_bool") + collectAndValidate(df, json, "boolData.json") + } + + test("byte type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_byte", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 8 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 8 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 4, + | "columns" : [ { + | "name" : "a_byte", + | "count" : 4, + | "VALIDITY" : [ 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 64, 127 ] + | } ] + | } ] + |} + | + """.stripMargin + val df = List[Byte](1.toByte, (-1).toByte, 64.toByte, Byte.MaxValue).toDF("a_byte") + collectAndValidate(df, json, "byteData.json") + } + + test("binary type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_binary", + | "type" : { + | "name" : "binary" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "OFFSET", + | "typeBitWidth" : 32 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 8 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 3, + | "columns" : [ { + | "name" : "a_binary", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "OFFSET" : [ 0, 3, 4, 6 ], + | "DATA" : [ "616263", "64", "6566" ] + | } ] + | } ] + |} + """.stripMargin + + val data = Seq("abc", "d", "ef") + val rdd = sparkContext.parallelize(data.map(s => Row(s.getBytes("utf-8")))) + val df = spark.createDataFrame(rdd, StructType(Seq(StructField("a_binary", BinaryType)))) + + collectAndValidate(df, json, "binaryData.json") + } + + test("floating-point NaN") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "NaN_f", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "SINGLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "NaN_d", + | "type" : { + | "name" : "floatingpoint", + | "precision" : "DOUBLE" + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 2, + | "columns" : [ { + | "name" : "NaN_f", + | "count" : 2, + | "VALIDITY" : [ 1, 1 ], + | "DATA" : [ 1.2000000476837158, "NaN" ] + | }, { + | "name" : "NaN_d", + | "count" : 2, + | "VALIDITY" : [ 1, 1 ], + | "DATA" : [ "NaN", 1.2 ] + | } ] + | } ] + |} + """.stripMargin + + val fnan = Seq(1.2F, Float.NaN) + val dnan = Seq(Double.NaN, 1.2) + val df = fnan.zip(dnan).toDF("NaN_f", "NaN_d") + + collectAndValidate(df, json, "nanData-floating_point.json") + } + + test("partitioned DataFrame") { + val json1 = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "b", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | 
"bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 3, + | "columns" : [ { + | "name" : "a", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "DATA" : [ 1, 1, 2 ] + | }, { + | "name" : "b", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "DATA" : [ 1, 2, 1 ] + | } ] + | } ] + |} + """.stripMargin + val json2 = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "b", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 3, + | "columns" : [ { + | "name" : "a", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "DATA" : [ 2, 3, 3 ] + | }, { + | "name" : "b", + | "count" : 3, + | "VALIDITY" : [ 1, 1, 1 ], + | "DATA" : [ 2, 1, 2 ] + | } ] + | } ] + |} + """.stripMargin + + val arrowPayloads = testData2.toArrowPayload.collect() + // NOTE: testData2 should have 2 partitions -> 2 arrow batches in payload + assert(arrowPayloads.length === 2) + val schema = testData2.schema + + val tempFile1 = new File(tempDataPath, "testData2-ints-part1.json") + val tempFile2 = new File(tempDataPath, "testData2-ints-part2.json") + Files.write(json1, tempFile1, StandardCharsets.UTF_8) + Files.write(json2, tempFile2, StandardCharsets.UTF_8) + + validateConversion(schema, arrowPayloads(0), tempFile1) + validateConversion(schema, arrowPayloads(1), tempFile2) + } + + test("empty frame collect") { + val arrowPayload = spark.emptyDataFrame.toArrowPayload.collect() + assert(arrowPayload.isEmpty) + + val filteredDF = List[Int](1, 2, 3, 4, 5, 6).toDF("i") + val filteredArrowPayload = filteredDF.filter("i < 0").toArrowPayload.collect() + assert(filteredArrowPayload.isEmpty) + } + + test("empty partition collect") { + val emptyPart = spark.sparkContext.parallelize(Seq(1), 2).toDF("i") + val arrowPayloads = emptyPart.toArrowPayload.collect() + assert(arrowPayloads.length === 1) + val allocator = new RootAllocator(Long.MaxValue) + val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) + assert(arrowRecordBatches.head.getLength == 1) + arrowRecordBatches.foreach(_.close()) + allocator.close() + } + + test("max records in batch conf") { + val totalRecords = 10 + val maxRecordsPerBatch = 3 + spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", maxRecordsPerBatch) + val df = spark.sparkContext.parallelize(1 to totalRecords, 2).toDF("i") + val arrowPayloads = df.toArrowPayload.collect() + val allocator = new RootAllocator(Long.MaxValue) + val arrowRecordBatches = arrowPayloads.map(_.loadBatch(allocator)) + var recordCount = 0 + arrowRecordBatches.foreach { batch => + assert(batch.getLength > 0) + assert(batch.getLength <= maxRecordsPerBatch) + recordCount += batch.getLength + batch.close() + } + assert(recordCount == totalRecords) + allocator.close() + 
spark.conf.unset("spark.sql.execution.arrow.maxRecordsPerBatch") + } + + testQuietly("unsupported types") { + def runUnsupported(block: => Unit): Unit = { + val msg = intercept[SparkException] { + block + } + assert(msg.getMessage.contains("Unsupported data type")) + assert(msg.getCause.getClass === classOf[UnsupportedOperationException]) + } + + runUnsupported { decimalData.toArrowPayload.collect() } + runUnsupported { arrayData.toDF().toArrowPayload.collect() } + runUnsupported { mapData.toDF().toArrowPayload.collect() } + runUnsupported { complexData.toArrowPayload.collect() } + + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z", Locale.US) + val d1 = new Date(sdf.parse("2015-04-08 13:10:15.000 UTC").getTime) + val d2 = new Date(sdf.parse("2016-05-09 13:10:15.000 UTC").getTime) + runUnsupported { Seq(d1, d2).toDF("date").toArrowPayload.collect() } + + val ts1 = new Timestamp(sdf.parse("2013-04-08 01:10:15.567 UTC").getTime) + val ts2 = new Timestamp(sdf.parse("2013-04-08 13:10:10.789 UTC").getTime) + runUnsupported { Seq(ts1, ts2).toDF("timestamp").toArrowPayload.collect() } + } + + test("test Arrow Validator") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "b_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_i", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] + | }, { + | "name" : "b_i", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] + | } ] + | } ] + |} + """.stripMargin + val json_diff_col_order = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "b_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | }, { + | "name" : "a_i", + | "type" : { + | "name" : "int", + | "isSigned" : true, + | "bitWidth" : 32 + | }, + | "nullable" : false, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 6, + | "columns" : [ { + | "name" : "a_i", + | "count" : 6, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ 1, -1, 2, -2, 2147483647, -2147483648 ] + | }, { + | "name" : "b_i", + | "count" : 6, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1 ], + | "DATA" : [ 1, 0, 0, -2, 0, -2147483648 ] + | } ] + | } ] + |} + """.stripMargin + + val a_i = List[Int](1, -1, 2, -2, 2147483647, -2147483648) + val b_i = List[Option[Int]](Some(1), None, None, Some(-2), None, Some(-2147483648)) + val df = a_i.zip(b_i).toDF("a_i", "b_i") + + // Different schema + intercept[IllegalArgumentException] { + collectAndValidate(df, 
json_diff_col_order, "validator_diff_schema.json") + } + + // Different values + intercept[IllegalArgumentException] { + collectAndValidate(df.sort($"a_i".desc), json, "validator_diff_values.json") + } + } + + /** Test that a converted DataFrame to Arrow record batch equals batch read from JSON file */ + private def collectAndValidate(df: DataFrame, json: String, file: String): Unit = { + // NOTE: coalesce to single partition because can only load 1 batch in validator + val arrowPayload = df.coalesce(1).toArrowPayload.collect().head + val tempFile = new File(tempDataPath, file) + Files.write(json, tempFile, StandardCharsets.UTF_8) + validateConversion(df.schema, arrowPayload, tempFile) + } + + private def validateConversion( + sparkSchema: StructType, + arrowPayload: ArrowPayload, + jsonFile: File): Unit = { + val allocator = new RootAllocator(Long.MaxValue) + val jsonReader = new JsonFileReader(jsonFile, allocator) + + val arrowSchema = ArrowConverters.schemaToArrowSchema(sparkSchema) + val jsonSchema = jsonReader.start() + Validator.compareSchemas(arrowSchema, jsonSchema) + + val arrowRoot = VectorSchemaRoot.create(arrowSchema, allocator) + val vectorLoader = new VectorLoader(arrowRoot) + val arrowRecordBatch = arrowPayload.loadBatch(allocator) + vectorLoader.load(arrowRecordBatch) + val jsonRoot = jsonReader.read() + Validator.compareVectorSchemaRoot(arrowRoot, jsonRoot) + + jsonRoot.close() + jsonReader.close() + arrowRecordBatch.close() + arrowRoot.close() + allocator.close() + } +} From c3713fde86204bf3f027483914ff9e60e7aad261 Mon Sep 17 00:00:00 2001 From: chie8842 Date: Mon, 10 Jul 2017 18:56:54 -0700 Subject: [PATCH 148/779] [SPARK-21358][EXAMPLES] Argument of repartitionandsortwithinpartitions at pyspark ## What changes were proposed in this pull request? At example of repartitionAndSortWithinPartitions at rdd.py, third argument should be True or False. I proposed fix of example code. ## How was this patch tested? * I rename test_repartitionAndSortWithinPartitions to test_repartitionAndSortWIthinPartitions_asc to specify boolean argument. * I added test_repartitionAndSortWithinPartitions_desc to test False pattern at third argument. (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: chie8842 Closes #18586 from chie8842/SPARK-21358. --- python/pyspark/rdd.py | 2 +- python/pyspark/tests.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 7dfa17f68a943..3325b65f8b600 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -608,7 +608,7 @@ def repartitionAndSortWithinPartitions(self, numPartitions=None, partitionFunc=p sort records by their keys. 
>>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)]) - >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, 2) + >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, True) >>> rdd2.glom().collect() [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]] """ diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index bb13de563cdd4..73ab442dfd791 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1019,14 +1019,22 @@ def test_histogram(self): self.assertEqual((["ab", "ef"], [5]), rdd.histogram(1)) self.assertRaises(TypeError, lambda: rdd.histogram(2)) - def test_repartitionAndSortWithinPartitions(self): + def test_repartitionAndSortWithinPartitions_asc(self): rdd = self.sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)], 2) - repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2) + repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2, True) partitions = repartitioned.glom().collect() self.assertEqual(partitions[0], [(0, 5), (0, 8), (2, 6)]) self.assertEqual(partitions[1], [(1, 3), (3, 8), (3, 8)]) + def test_repartitionAndSortWithinPartitions_desc(self): + rdd = self.sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)], 2) + + repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2, False) + partitions = repartitioned.glom().collect() + self.assertEqual(partitions[0], [(2, 6), (0, 5), (0, 8)]) + self.assertEqual(partitions[1], [(3, 8), (3, 8), (1, 3)]) + def test_repartition_no_skewed(self): num_partitions = 20 a = self.sc.parallelize(range(int(1000)), 2) From a2bec6c92a063f4a8e9ed75a9f3f06808485b6d7 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 10 Jul 2017 20:16:29 -0700 Subject: [PATCH 149/779] [SPARK-21043][SQL] Add unionByName in Dataset ## What changes were proposed in this pull request? This pr added `unionByName` in `DataSet`. Here is how to use: ``` val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2") val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0") df1.unionByName(df2).show // output: // +----+----+----+ // |col0|col1|col2| // +----+----+----+ // | 1| 2| 3| // | 6| 4| 5| // +----+----+----+ ``` ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #18300 from maropu/SPARK-21043-2. --- .../scala/org/apache/spark/sql/Dataset.scala | 60 +++++++++++++ .../org/apache/spark/sql/DataFrameSuite.scala | 87 +++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index a7773831df075..7f3ae05411516 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -53,6 +53,7 @@ import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.Utils @@ -1734,6 +1735,65 @@ class Dataset[T] private[sql]( CombineUnions(Union(logicalPlan, other.logicalPlan)) } + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. + * + * This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. 
To do a SQL-style set + * union (that does deduplication of elements), use this function followed by a [[distinct]]. + * + * The difference between this function and [[union]] is that this function + * resolves columns by name (not by position): + * + * {{{ + * val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2") + * val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0") + * df1.unionByName(df2).show + * + * // output: + * // +----+----+----+ + * // |col0|col1|col2| + * // +----+----+----+ + * // | 1| 2| 3| + * // | 6| 4| 5| + * // +----+----+----+ + * }}} + * + * @group typedrel + * @since 2.3.0 + */ + def unionByName(other: Dataset[T]): Dataset[T] = withSetOperator { + // Check column name duplication + val resolver = sparkSession.sessionState.analyzer.resolver + val leftOutputAttrs = logicalPlan.output + val rightOutputAttrs = other.logicalPlan.output + + SchemaUtils.checkColumnNameDuplication( + leftOutputAttrs.map(_.name), + "in the left attributes", + sparkSession.sessionState.conf.caseSensitiveAnalysis) + SchemaUtils.checkColumnNameDuplication( + rightOutputAttrs.map(_.name), + "in the right attributes", + sparkSession.sessionState.conf.caseSensitiveAnalysis) + + // Builds a project list for `other` based on `logicalPlan` output names + val rightProjectList = leftOutputAttrs.map { lattr => + rightOutputAttrs.find { rattr => resolver(lattr.name, rattr.name) }.getOrElse { + throw new AnalysisException( + s"""Cannot resolve column name "${lattr.name}" among """ + + s"""(${rightOutputAttrs.map(_.name).mkString(", ")})""") + } + } + + // Delegates failure checks to `CheckAnalysis` + val notFoundAttrs = rightOutputAttrs.diff(rightProjectList) + val rightChild = Project(rightProjectList ++ notFoundAttrs, other.logicalPlan) + + // This breaks caching, but it's usually ok because it addresses a very specific use case: + // using union to union many files or partitions. + CombineUnions(Union(logicalPlan, rightChild)) + } + /** * Returns a new Dataset containing rows only in both this Dataset and another Dataset. * This is equivalent to `INTERSECT` in SQL. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index a5a2e1c38d300..5ae27032e0e94 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -111,6 +111,93 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { ) } + test("union by name") { + var df1 = Seq((1, 2, 3)).toDF("a", "b", "c") + var df2 = Seq((3, 1, 2)).toDF("c", "a", "b") + val df3 = Seq((2, 3, 1)).toDF("b", "c", "a") + val unionDf = df1.unionByName(df2.unionByName(df3)) + checkAnswer(unionDf, + Row(1, 2, 3) :: Row(1, 2, 3) :: Row(1, 2, 3) :: Nil + ) + + // Check if adjacent unions are combined into a single one + assert(unionDf.queryExecution.optimizedPlan.collect { case u: Union => true }.size == 1) + + // Check failure cases + df1 = Seq((1, 2)).toDF("a", "c") + df2 = Seq((3, 4, 5)).toDF("a", "b", "c") + var errMsg = intercept[AnalysisException] { + df1.unionByName(df2) + }.getMessage + assert(errMsg.contains( + "Union can only be performed on tables with the same number of columns, " + + "but the first table has 2 columns and the second table has 3 columns")) + + df1 = Seq((1, 2, 3)).toDF("a", "b", "c") + df2 = Seq((4, 5, 6)).toDF("a", "c", "d") + errMsg = intercept[AnalysisException] { + df1.unionByName(df2) + }.getMessage + assert(errMsg.contains("""Cannot resolve column name "b" among (a, c, d)""")) + } + + test("union by name - type coercion") { + var df1 = Seq((1, "a")).toDF("c0", "c1") + var df2 = Seq((3, 1L)).toDF("c1", "c0") + checkAnswer(df1.unionByName(df2), Row(1L, "a") :: Row(1L, "3") :: Nil) + + df1 = Seq((1, 1.0)).toDF("c0", "c1") + df2 = Seq((8L, 3.0)).toDF("c1", "c0") + checkAnswer(df1.unionByName(df2), Row(1.0, 1.0) :: Row(3.0, 8.0) :: Nil) + + df1 = Seq((2.0f, 7.4)).toDF("c0", "c1") + df2 = Seq(("a", 4.0)).toDF("c1", "c0") + checkAnswer(df1.unionByName(df2), Row(2.0, "7.4") :: Row(4.0, "a") :: Nil) + + df1 = Seq((1, "a", 3.0)).toDF("c0", "c1", "c2") + df2 = Seq((1.2, 2, "bc")).toDF("c2", "c0", "c1") + val df3 = Seq(("def", 1.2, 3)).toDF("c1", "c2", "c0") + checkAnswer(df1.unionByName(df2.unionByName(df3)), + Row(1, "a", 3.0) :: Row(2, "bc", 1.2) :: Row(3, "def", 1.2) :: Nil + ) + } + + test("union by name - check case sensitivity") { + def checkCaseSensitiveTest(): Unit = { + val df1 = Seq((1, 2, 3)).toDF("ab", "cd", "ef") + val df2 = Seq((4, 5, 6)).toDF("cd", "ef", "AB") + checkAnswer(df1.unionByName(df2), Row(1, 2, 3) :: Row(6, 4, 5) :: Nil) + } + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val errMsg2 = intercept[AnalysisException] { + checkCaseSensitiveTest() + }.getMessage + assert(errMsg2.contains("""Cannot resolve column name "ab" among (cd, ef, AB)""")) + } + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + checkCaseSensitiveTest() + } + } + + test("union by name - check name duplication") { + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + var df1 = Seq((1, 1)).toDF(c0, c1) + var df2 = Seq((1, 1)).toDF("c0", "c1") + var errMsg = intercept[AnalysisException] { + df1.unionByName(df2) + }.getMessage + assert(errMsg.contains("Found duplicate column(s) in the left attributes:")) + df1 = Seq((1, 1)).toDF("c0", "c1") + df2 = Seq((1, 1)).toDF(c0, c1) + errMsg = intercept[AnalysisException] { + df1.unionByName(df2) + }.getMessage + assert(errMsg.contains("Found duplicate 
column(s) in the right attributes:")) + } + } + } + test("empty data frame") { assert(spark.emptyDataFrame.columns.toSeq === Seq.empty[String]) assert(spark.emptyDataFrame.count() === 0) From 1471ee7af5a9952b60cf8c56d60cb6a7ec46cc69 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 11 Jul 2017 11:19:59 +0800 Subject: [PATCH 150/779] [SPARK-21350][SQL] Fix the error message when the number of arguments is wrong when invoking a UDF ### What changes were proposed in this pull request? Users get a very confusing error when users specify a wrong number of parameters. ```Scala val df = spark.emptyDataFrame spark.udf.register("foo", (_: String).length) df.selectExpr("foo(2, 3, 4)") ``` ``` org.apache.spark.sql.UDFSuite$$anonfun$9$$anonfun$apply$mcV$sp$12 cannot be cast to scala.Function3 java.lang.ClassCastException: org.apache.spark.sql.UDFSuite$$anonfun$9$$anonfun$apply$mcV$sp$12 cannot be cast to scala.Function3 at org.apache.spark.sql.catalyst.expressions.ScalaUDF.(ScalaUDF.scala:109) ``` This PR is to capture the exception and issue an error message that is consistent with what we did for built-in functions. After the fix, the error message is improved to ``` Invalid number of arguments for function foo; line 1 pos 0 org.apache.spark.sql.AnalysisException: Invalid number of arguments for function foo; line 1 pos 0 at org.apache.spark.sql.catalyst.analysis.SimpleFunctionRegistry.lookupFunction(FunctionRegistry.scala:119) ``` ### How was this patch tested? Added a test case Author: gatorsmile Closes #18574 from gatorsmile/statsCheck. --- .../apache/spark/sql/UDFRegistration.scala | 412 +++++++++++++----- .../org/apache/spark/sql/JavaUDFSuite.java | 8 + .../scala/org/apache/spark/sql/UDFSuite.scala | 13 +- 3 files changed, 331 insertions(+), 102 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 8bdc0221888d0..c4d0adb5236f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -111,7 +111,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try($inputTypes).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == $x) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: $x; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) }""") @@ -123,16 +128,20 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs, Any]]" val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") println(s""" - |/** - | * Register a user-defined function with ${i} arguments. 
- | * @since 1.3.0 - | */ - |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType): Unit = { - | val func = f$anyCast.call($anyParams) - | functionRegistry.createOrReplaceTempFunction( - | name, - | (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) - |}""".stripMargin) + |/** + | * Register a user-defined function with ${i} arguments. + | * @since 1.3.0 + | */ + |def register(name: String, f: UDF$i[$extTypeArgs, _], returnType: DataType): Unit = { + | val func = f$anyCast.call($anyParams) + |def builder(e: Seq[Expression]) = if (e.length == $i) { + | ScalaUDF(func, returnType, e) + |} else { + | throw new AnalysisException("Invalid number of arguments for function " + name + + | ". Expected: $i; Found: " + e.length) + |} + |functionRegistry.createOrReplaceTempFunction(name, builder) + |}""".stripMargin) } */ @@ -144,7 +153,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 0) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 0; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -157,7 +171,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 1) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 1; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -170,7 +189,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 2) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 2; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -183,7 +207,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 3) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 3; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -196,7 +225,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 4) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 4; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -209,7 +243,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 5) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 5; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -222,7 +261,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 6) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 6; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -235,7 +279,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 7) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 7; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -248,7 +297,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 8) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 8; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -261,7 +315,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 9) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 9; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -274,7 +333,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 10) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 10; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -287,7 +351,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 11) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 11; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -300,7 +369,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 12) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 12; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -313,7 +387,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 13) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 13; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -326,7 +405,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 14) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 14; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -339,7 +423,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 15) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 15; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -352,7 +441,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 16) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 16; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -365,7 +459,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 17) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 17; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -378,7 +477,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: ScalaReflection.schemaFor[A18].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 18) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 18; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -391,7 +495,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: ScalaReflection.schemaFor[A18].dataType :: ScalaReflection.schemaFor[A19].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 19) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 19; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -404,7 +513,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: ScalaReflection.schemaFor[A18].dataType :: ScalaReflection.schemaFor[A19].dataType :: ScalaReflection.schemaFor[A20].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 20) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 20; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -417,7 +531,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: ScalaReflection.schemaFor[A18].dataType :: ScalaReflection.schemaFor[A19].dataType :: ScalaReflection.schemaFor[A20].dataType :: ScalaReflection.schemaFor[A21].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 21) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 21; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -430,7 +549,12 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] val inputTypes = Try(ScalaReflection.schemaFor[A1].dataType :: ScalaReflection.schemaFor[A2].dataType :: ScalaReflection.schemaFor[A3].dataType :: ScalaReflection.schemaFor[A4].dataType :: ScalaReflection.schemaFor[A5].dataType :: ScalaReflection.schemaFor[A6].dataType :: ScalaReflection.schemaFor[A7].dataType :: ScalaReflection.schemaFor[A8].dataType :: ScalaReflection.schemaFor[A9].dataType :: ScalaReflection.schemaFor[A10].dataType :: ScalaReflection.schemaFor[A11].dataType :: ScalaReflection.schemaFor[A12].dataType :: ScalaReflection.schemaFor[A13].dataType :: ScalaReflection.schemaFor[A14].dataType :: ScalaReflection.schemaFor[A15].dataType :: ScalaReflection.schemaFor[A16].dataType :: ScalaReflection.schemaFor[A17].dataType :: ScalaReflection.schemaFor[A18].dataType :: ScalaReflection.schemaFor[A19].dataType :: ScalaReflection.schemaFor[A20].dataType :: ScalaReflection.schemaFor[A21].dataType :: ScalaReflection.schemaFor[A22].dataType :: Nil).toOption - def builder(e: Seq[Expression]) = ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + def builder(e: Seq[Expression]) = if (e.length == 22) { + ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 22; Found: " + e.length) + } functionRegistry.createOrReplaceTempFunction(name, builder) UserDefinedFunction(func, dataType, inputTypes).withName(name).withNullability(nullable) } @@ -531,9 +655,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 1) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 1; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -542,9 +670,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 2) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 2; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -553,9 +685,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 3) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 3; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -564,9 +700,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 4) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 4; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -575,9 +715,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 5) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 5; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -586,9 +730,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 6) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 6; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -597,9 +745,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 7) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 7; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -608,9 +760,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 8) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 8; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -619,9 +775,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 9) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 9; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -630,9 +790,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 10) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 10; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -641,9 +805,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 11) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 11; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -652,9 +820,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 12) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 12; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -663,9 +835,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 13) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 13; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -674,9 +850,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 14) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 14; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -685,9 +865,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 15) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 15; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -696,9 +880,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 16) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 16; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -707,9 +895,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 17) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 17; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -718,9 +910,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 18) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 18; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -729,9 +925,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 19) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 19; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -740,9 +940,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 20) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 20; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -751,9 +955,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 21) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". Expected: 21; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } /** @@ -762,9 +970,13 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - functionRegistry.createOrReplaceTempFunction( - name, - (e: Seq[Expression]) => ScalaUDF(func, returnType, e)) + def builder(e: Seq[Expression]) = if (e.length == 22) { + ScalaUDF(func, returnType, e) + } else { + throw new AnalysisException("Invalid number of arguments for function " + name + + ". 
Expected: 22; Found: " + e.length) + } + functionRegistry.createOrReplaceTempFunction(name, builder) } // scalastyle:on line.size.limit diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index 250fa674d8ecc..4fb2988f24d26 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -25,6 +25,7 @@ import org.junit.Before; import org.junit.Test; +import org.apache.spark.sql.AnalysisException; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.api.java.UDF2; @@ -105,4 +106,11 @@ public void udf4Test() { } Assert.assertEquals(55, sum); } + + @SuppressWarnings("unchecked") + @Test(expected = AnalysisException.class) + public void udf5Test() { + spark.udf().register("inc", (Long i) -> i + 1, DataTypes.LongType); + List results = spark.sql("SELECT inc(1, 5)").collectAsList(); + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index b4f744b193ada..335b882ace92a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -71,12 +71,21 @@ class UDFSuite extends QueryTest with SharedSQLContext { } } - test("error reporting for incorrect number of arguments") { + test("error reporting for incorrect number of arguments - builtin function") { val df = spark.emptyDataFrame val e = intercept[AnalysisException] { df.selectExpr("substr('abcd', 2, 3, 4)") } - assert(e.getMessage.contains("arguments")) + assert(e.getMessage.contains("Invalid number of arguments for function substr")) + } + + test("error reporting for incorrect number of arguments - udf") { + val df = spark.emptyDataFrame + val e = intercept[AnalysisException] { + spark.udf.register("foo", (_: String).length) + df.selectExpr("foo(2, 3, 4)") + } + assert(e.getMessage.contains("Invalid number of arguments for function foo")) } test("error reporting for undefined functions") { From 833eab2c9bd273ee9577fbf9e480d3e3a4b7d203 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Tue, 11 Jul 2017 11:26:17 +0800 Subject: [PATCH 151/779] [SPARK-21369][CORE] Don't use Scala Tuple2 in common/network-* ## What changes were proposed in this pull request? Remove all usages of Scala Tuple2 from common/network-* projects. Otherwise, Yarn users cannot use `spark.reducer.maxReqSizeShuffleToMem`. ## How was this patch tested? Jenkins. Author: Shixiong Zhu Closes #18593 from zsxwing/SPARK-21369. 
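
For context, a minimal Scala sketch of the setting referenced above. This is a hypothetical usage example, not part of this patch; the object name and the 200 MB threshold are assumed for illustration only.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical usage sketch of the config key named in the commit message above.
// The 200 MB threshold is an assumed example value, not a recommendation.
object ShuffleToMemDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]") // local master only for experimentation
      .appName("shuffle-to-mem-demo")
      // Shuffle requests above this size are fetched to disk rather than to memory.
      .config("spark.reducer.maxReqSizeShuffleToMem", (200L * 1024 * 1024).toString)
      .getOrCreate()
    // ... run a shuffle-heavy job here ...
    spark.stop()
  }
}
```
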
--- common/network-common/pom.xml | 3 ++- .../client/TransportResponseHandler.java | 20 +++++++++---------- .../server/OneForOneStreamManager.java | 17 +++++----------- common/network-shuffle/pom.xml | 1 + common/network-yarn/pom.xml | 1 + 5 files changed, 19 insertions(+), 23 deletions(-) diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 066970f24205f..0254d0cefc368 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -90,7 +90,8 @@ org.apache.spark spark-tags_${scala.binary.version} - + test + + runtime log4j @@ -1859,9 +1859,9 @@ ${antlr4.version} - ${jline.groupid} + jline jline - ${jline.version} + 2.12.1 org.apache.commons @@ -1933,6 +1933,7 @@ --> org.jboss.netty org.codehaus.groovy + *:*_2.10 true @@ -1987,6 +1988,8 @@ -unchecked -deprecation -feature + -explaintypes + -Yno-adapted-args -Xms1024m @@ -2585,44 +2588,6 @@ - - scala-2.10 - - scala-2.10 - - - 2.10.6 - 2.10 - ${scala.version} - org.scala-lang - - - - - org.apache.maven.plugins - maven-enforcer-plugin - - - enforce-versions - - enforce - - - - - - *:*_2.11 - - - - - - - - - - - test-java-home @@ -2633,16 +2598,18 @@ + scala-2.11 - - !scala-2.10 - + + + + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 89b0c7a3ab7b0..41f3a0451aa8a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -87,19 +87,11 @@ object SparkBuild extends PomBuild { val projectsMap: Map[String, Seq[Setting[_]]] = Map.empty override val profiles = { - val profiles = Properties.envOrNone("SBT_MAVEN_PROFILES") match { + Properties.envOrNone("SBT_MAVEN_PROFILES") match { case None => Seq("sbt") case Some(v) => v.split("(\\s+|,)").filterNot(_.isEmpty).map(_.trim.replaceAll("-P", "")).toSeq } - - if (System.getProperty("scala-2.10") == "") { - // To activate scala-2.10 profile, replace empty property value to non-empty value - // in the same way as Maven which handles -Dname as -Dname=true before executes build process. - // see: https://github.com/apache/maven/blob/maven-3.0.4/maven-embedder/src/main/java/org/apache/maven/cli/MavenCli.java#L1082 - System.setProperty("scala-2.10", "true") - } - profiles } Properties.envOrNone("SBT_MAVEN_PROPERTIES") match { @@ -234,9 +226,7 @@ object SparkBuild extends PomBuild { }, javacJVMVersion := "1.8", - // SBT Scala 2.10 build still doesn't support Java 8, because scalac 2.10 doesn't, but, - // it also doesn't touch Java 8 code and it's OK to emit Java 7 bytecode in this case - scalacJVMVersion := (if (System.getProperty("scala-2.10") == "true") "1.7" else "1.8"), + scalacJVMVersion := "1.8", javacOptions in Compile ++= Seq( "-encoding", "UTF-8", @@ -477,7 +467,6 @@ object OldDeps { def oldDepsSettings() = Defaults.coreDefaultSettings ++ Seq( name := "old-deps", - scalaVersion := "2.10.5", libraryDependencies := allPreviousArtifactKeys.value.flatten ) } @@ -756,13 +745,7 @@ object CopyDependencies { object TestSettings { import BuildCommons._ - private val scalaBinaryVersion = - if (System.getProperty("scala-2.10") == "true") { - "2.10" - } else { - "2.11" - } - + private val scalaBinaryVersion = "2.11" lazy val settings = Seq ( // Fork new JVMs for tests and set Java options for those fork := true, diff --git a/python/run-tests.py b/python/run-tests.py index b2e50435bb192..afd3d29a0ff90 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -54,7 +54,8 @@ def print_red(text): LOGGER = logging.getLogger() # Find out where the assembly jars are located. 
-for scala in ["2.11", "2.10"]: +# Later, add back 2.12 to this list: +for scala in ["2.11"]: build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) if os.path.isdir(build_dir): SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") diff --git a/repl/pom.xml b/repl/pom.xml index 6d133a3cfff7d..51eb9b60dd54a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -32,8 +32,8 @@ repl - scala-2.10/src/main/scala - scala-2.10/src/test/scala + scala-2.11/src/main/scala + scala-2.11/src/test/scala @@ -71,7 +71,7 @@ ${scala.version} - ${jline.groupid} + jline jline @@ -170,23 +170,17 @@ + + + diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/Main.scala deleted file mode 100644 index fba321be91886..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/Main.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.repl - -import org.apache.spark.internal.Logging - -object Main extends Logging { - - initializeLogIfNecessary(true) - Signaling.cancelOnInterrupt() - - private var _interp: SparkILoop = _ - - def interp = _interp - - def interp_=(i: SparkILoop) { _interp = i } - - def main(args: Array[String]) { - _interp = new SparkILoop - _interp.process(args) - } -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala deleted file mode 100644 index be9b79021d2a8..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.repl - -import scala.tools.nsc.{CompilerCommand, Settings} - -import org.apache.spark.annotation.DeveloperApi - -/** - * Command class enabling Spark-specific command line options (provided by - * org.apache.spark.repl.SparkRunnerSettings). 
- * - * @example new SparkCommandLine(Nil).settings - * - * @param args The list of command line arguments - * @param settings The underlying settings to associate with this set of - * command-line options - */ -@DeveloperApi -class SparkCommandLine(args: List[String], override val settings: Settings) - extends CompilerCommand(args, settings) { - def this(args: List[String], error: String => Unit) { - this(args, new SparkRunnerSettings(error)) - } - - def this(args: List[String]) { - // scalastyle:off println - this(args, str => Console.println("Error: " + str)) - // scalastyle:on println - } -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala deleted file mode 100644 index 2b5d56a895902..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala +++ /dev/null @@ -1,114 +0,0 @@ -// scalastyle:off - -/* NSC -- new Scala compiler - * Copyright 2005-2013 LAMP/EPFL - * @author Paul Phillips - */ - -package org.apache.spark.repl - -import scala.tools.nsc._ -import scala.tools.nsc.interpreter._ - -import scala.reflect.internal.util.BatchSourceFile -import scala.tools.nsc.ast.parser.Tokens.EOF - -import org.apache.spark.internal.Logging - -private[repl] trait SparkExprTyper extends Logging { - val repl: SparkIMain - - import repl._ - import global.{ reporter => _, Import => _, _ } - import definitions._ - import syntaxAnalyzer.{ UnitParser, UnitScanner, token2name } - import naming.freshInternalVarName - - object codeParser extends { val global: repl.global.type = repl.global } with CodeHandlers[Tree] { - def applyRule[T](code: String, rule: UnitParser => T): T = { - reporter.reset() - val scanner = newUnitParser(code) - val result = rule(scanner) - - if (!reporter.hasErrors) - scanner.accept(EOF) - - result - } - - def defns(code: String) = stmts(code) collect { case x: DefTree => x } - def expr(code: String) = applyRule(code, _.expr()) - def stmts(code: String) = applyRule(code, _.templateStats()) - def stmt(code: String) = stmts(code).last // guaranteed nonempty - } - - /** Parse a line into a sequence of trees. Returns None if the input is incomplete. */ - def parse(line: String): Option[List[Tree]] = debugging(s"""parse("$line")""") { - var isIncomplete = false - reporter.withIncompleteHandler((_, _) => isIncomplete = true) { - val trees = codeParser.stmts(line) - if (reporter.hasErrors) { - Some(Nil) - } else if (isIncomplete) { - None - } else { - Some(trees) - } - } - } - // def parsesAsExpr(line: String) = { - // import codeParser._ - // (opt expr line).isDefined - // } - - def symbolOfLine(code: String): Symbol = { - def asExpr(): Symbol = { - val name = freshInternalVarName() - // Typing it with a lazy val would give us the right type, but runs - // into compiler bugs with things like existentials, so we compile it - // behind a def and strip the NullaryMethodType which wraps the expr. 
- val line = "def " + name + " = {\n" + code + "\n}" - - interpretSynthetic(line) match { - case IR.Success => - val sym0 = symbolOfTerm(name) - // drop NullaryMethodType - val sym = sym0.cloneSymbol setInfo afterTyper(sym0.info.finalResultType) - if (sym.info.typeSymbol eq UnitClass) NoSymbol else sym - case _ => NoSymbol - } - } - def asDefn(): Symbol = { - val old = repl.definedSymbolList.toSet - - interpretSynthetic(code) match { - case IR.Success => - repl.definedSymbolList filterNot old match { - case Nil => NoSymbol - case sym :: Nil => sym - case syms => NoSymbol.newOverloaded(NoPrefix, syms) - } - case _ => NoSymbol - } - } - beQuietDuring(asExpr()) orElse beQuietDuring(asDefn()) - } - - private var typeOfExpressionDepth = 0 - def typeOfExpression(expr: String, silent: Boolean = true): Type = { - if (typeOfExpressionDepth > 2) { - logDebug("Terminating typeOfExpression recursion for expression: " + expr) - return NoType - } - typeOfExpressionDepth += 1 - // Don't presently have a good way to suppress undesirable success output - // while letting errors through, so it is first trying it silently: if there - // is an error, and errors are desired, then it re-evaluates non-silently - // to induce the error message. - try beSilentDuring(symbolOfLine(expr).tpe) match { - case NoType if !silent => symbolOfLine(expr).tpe // generate error - case tpe => tpe - } - finally typeOfExpressionDepth -= 1 - } -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkHelper.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkHelper.scala deleted file mode 100644 index 955be17a73b85..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkHelper.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package scala.tools.nsc - -import org.apache.spark.annotation.DeveloperApi - -// NOTE: Forced to be public (and in scala.tools.nsc package) to access the -// settings "explicitParentLoader" method - -/** - * Provides exposure for the explicitParentLoader method on settings instances. - */ -@DeveloperApi -object SparkHelper { - /** - * Retrieves the explicit parent loader for the provided settings. 
- * - * @param settings The settings whose explicit parent loader to retrieve - * - * @return The Optional classloader representing the explicit parent loader - */ - @DeveloperApi - def explicitParentLoader(settings: Settings) = settings.explicitParentLoader -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala deleted file mode 100644 index b7237a6ce822f..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ /dev/null @@ -1,1145 +0,0 @@ -// scalastyle:off - -/* NSC -- new Scala compiler - * Copyright 2005-2013 LAMP/EPFL - * @author Alexander Spoon - */ - -package org.apache.spark.repl - - -import java.net.URL - -import scala.reflect.io.AbstractFile -import scala.tools.nsc._ -import scala.tools.nsc.backend.JavaPlatform -import scala.tools.nsc.interpreter._ -import scala.tools.nsc.interpreter.{Results => IR} -import Predef.{println => _, _} -import java.io.{BufferedReader, FileReader} -import java.net.URI -import java.util.concurrent.locks.ReentrantLock -import scala.sys.process.Process -import scala.tools.nsc.interpreter.session._ -import scala.util.Properties.{jdkHome, javaVersion} -import scala.tools.util.{Javap} -import scala.annotation.tailrec -import scala.collection.mutable.ListBuffer -import scala.concurrent.ops -import scala.tools.nsc.util._ -import scala.tools.nsc.interpreter._ -import scala.tools.nsc.io.{File, Directory} -import scala.reflect.NameTransformer._ -import scala.tools.nsc.util.ScalaClassLoader._ -import scala.tools.util._ -import scala.language.{implicitConversions, existentials, postfixOps} -import scala.reflect.{ClassTag, classTag} -import scala.tools.reflect.StdRuntimeTags._ - -import java.lang.{Class => jClass} -import scala.reflect.api.{Mirror, TypeCreator, Universe => ApiUniverse} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession -import org.apache.spark.util.Utils - -/** The Scala interactive shell. It provides a read-eval-print loop - * around the Interpreter class. - * After instantiation, clients should call the main() method. - * - * If no in0 is specified, then input will come from the console, and - * the class will attempt to provide input editing feature such as - * input history. - * - * @author Moez A. 
Abdel-Gawad - * @author Lex Spoon - * @version 1.2 - */ -@DeveloperApi -class SparkILoop( - private val in0: Option[BufferedReader], - protected val out: JPrintWriter, - val master: Option[String] -) extends AnyRef with LoopCommands with SparkILoopInit with Logging { - def this(in0: BufferedReader, out: JPrintWriter, master: String) = this(Some(in0), out, Some(master)) - def this(in0: BufferedReader, out: JPrintWriter) = this(Some(in0), out, None) - def this() = this(None, new JPrintWriter(Console.out, true), None) - - private var in: InteractiveReader = _ // the input stream from which commands come - - // NOTE: Exposed in package for testing - private[repl] var settings: Settings = _ - - private[repl] var intp: SparkIMain = _ - - @deprecated("Use `intp` instead.", "2.9.0") def interpreter = intp - @deprecated("Use `intp` instead.", "2.9.0") def interpreter_= (i: SparkIMain): Unit = intp = i - - /** Having inherited the difficult "var-ness" of the repl instance, - * I'm trying to work around it by moving operations into a class from - * which it will appear a stable prefix. - */ - private def onIntp[T](f: SparkIMain => T): T = f(intp) - - class IMainOps[T <: SparkIMain](val intp: T) { - import intp._ - import global._ - - def printAfterTyper(msg: => String) = - intp.reporter printMessage afterTyper(msg) - - /** Strip NullaryMethodType artifacts. */ - private def replInfo(sym: Symbol) = { - sym.info match { - case NullaryMethodType(restpe) if sym.isAccessor => restpe - case info => info - } - } - def echoTypeStructure(sym: Symbol) = - printAfterTyper("" + deconstruct.show(replInfo(sym))) - - def echoTypeSignature(sym: Symbol, verbose: Boolean) = { - if (verbose) SparkILoop.this.echo("// Type signature") - printAfterTyper("" + replInfo(sym)) - - if (verbose) { - SparkILoop.this.echo("\n// Internal Type structure") - echoTypeStructure(sym) - } - } - } - implicit def stabilizeIMain(intp: SparkIMain) = new IMainOps[intp.type](intp) - - /** TODO - - * -n normalize - * -l label with case class parameter names - * -c complete - leave nothing out - */ - private def typeCommandInternal(expr: String, verbose: Boolean): Result = { - onIntp { intp => - val sym = intp.symbolOfLine(expr) - if (sym.exists) intp.echoTypeSignature(sym, verbose) - else "" - } - } - - // NOTE: Must be public for visibility - @DeveloperApi - var sparkContext: SparkContext = _ - - override def echoCommandMessage(msg: String) { - intp.reporter printMessage msg - } - - // def isAsync = !settings.Yreplsync.value - private[repl] def isAsync = false - // lazy val power = new Power(intp, new StdReplVals(this))(tagOfStdReplVals, classTag[StdReplVals]) - private def history = in.history - - /** The context class loader at the time this object was created */ - protected val originalClassLoader = Utils.getContextOrSparkClassLoader - - // classpath entries added via :cp - private var addedClasspath: String = "" - - /** A reverse list of commands to replay if the user requests a :replay */ - private var replayCommandStack: List[String] = Nil - - /** A list of commands to replay if the user requests a :replay */ - private def replayCommands = replayCommandStack.reverse - - /** Record a command for replay should the user request a :replay */ - private def addReplay(cmd: String) = replayCommandStack ::= cmd - - private def savingReplayStack[T](body: => T): T = { - val saved = replayCommandStack - try body - finally replayCommandStack = saved - } - private def savingReader[T](body: => T): T = { - val saved = in - try body - finally in = saved 
- } - - - private def sparkCleanUp() { - echo("Stopping spark context.") - intp.beQuietDuring { - command("sc.stop()") - } - } - /** Close the interpreter and set the var to null. */ - private def closeInterpreter() { - if (intp ne null) { - sparkCleanUp() - intp.close() - intp = null - } - } - - class SparkILoopInterpreter extends SparkIMain(settings, out) { - outer => - - override private[repl] lazy val formatting = new Formatting { - def prompt = SparkILoop.this.prompt - } - override protected def parentClassLoader = SparkHelper.explicitParentLoader(settings).getOrElse(classOf[SparkILoop].getClassLoader) - } - - /** - * Constructs a new interpreter. - */ - protected def createInterpreter() { - require(settings != null) - - if (addedClasspath != "") settings.classpath.append(addedClasspath) - val addedJars = - if (Utils.isWindows) { - // Strip any URI scheme prefix so we can add the correct path to the classpath - // e.g. file:/C:/my/path.jar -> C:/my/path.jar - getAddedJars().map { jar => new URI(jar).getPath.stripPrefix("/") } - } else { - // We need new URI(jar).getPath here for the case that `jar` includes encoded white space (%20). - getAddedJars().map { jar => new URI(jar).getPath } - } - // work around for Scala bug - val totalClassPath = addedJars.foldLeft( - settings.classpath.value)((l, r) => ClassPath.join(l, r)) - this.settings.classpath.value = totalClassPath - - intp = new SparkILoopInterpreter - } - - /** print a friendly help message */ - private def helpCommand(line: String): Result = { - if (line == "") helpSummary() - else uniqueCommand(line) match { - case Some(lc) => echo("\n" + lc.longHelp) - case _ => ambiguousError(line) - } - } - private def helpSummary() = { - val usageWidth = commands map (_.usageMsg.length) max - val formatStr = "%-" + usageWidth + "s %s %s" - - echo("All commands can be abbreviated, e.g. :he instead of :help.") - echo("Those marked with a * have more detailed help, e.g. :help imports.\n") - - commands foreach { cmd => - val star = if (cmd.hasLongHelp) "*" else " " - echo(formatStr.format(cmd.usageMsg, star, cmd.help)) - } - } - private def ambiguousError(cmd: String): Result = { - matchingCommands(cmd) match { - case Nil => echo(cmd + ": no such command. Type :help for help.") - case xs => echo(cmd + " is ambiguous: did you mean " + xs.map(":" + _.name).mkString(" or ") + "?") - } - Result(true, None) - } - private def matchingCommands(cmd: String) = commands filter (_.name startsWith cmd) - private def uniqueCommand(cmd: String): Option[LoopCommand] = { - // this lets us add commands willy-nilly and only requires enough command to disambiguate - matchingCommands(cmd) match { - case List(x) => Some(x) - // exact match OK even if otherwise appears ambiguous - case xs => xs find (_.name == cmd) - } - } - private var fallbackMode = false - - private def toggleFallbackMode() { - val old = fallbackMode - fallbackMode = !old - System.setProperty("spark.repl.fallback", fallbackMode.toString) - echo(s""" - |Switched ${if (old) "off" else "on"} fallback mode without restarting. - | If you have defined classes in the repl, it would - |be good to redefine them incase you plan to use them. If you still run - |into issues it would be good to restart the repl and turn on `:fallback` - |mode as first command. 
- """.stripMargin) - } - - /** Show the history */ - private lazy val historyCommand = new LoopCommand("history", "show the history (optional num is commands to show)") { - override def usage = "[num]" - def defaultLines = 20 - - def apply(line: String): Result = { - if (history eq NoHistory) - return "No history available." - - val xs = words(line) - val current = history.index - val count = try xs.head.toInt catch { case _: Exception => defaultLines } - val lines = history.asStrings takeRight count - val offset = current - lines.size + 1 - - for ((line, index) <- lines.zipWithIndex) - echo("%3d %s".format(index + offset, line)) - } - } - - // When you know you are most likely breaking into the middle - // of a line being typed. This softens the blow. - private[repl] def echoAndRefresh(msg: String) = { - echo("\n" + msg) - in.redrawLine() - } - private[repl] def echo(msg: String) = { - out println msg - out.flush() - } - private def echoNoNL(msg: String) = { - out print msg - out.flush() - } - - /** Search the history */ - private def searchHistory(_cmdline: String) { - val cmdline = _cmdline.toLowerCase - val offset = history.index - history.size + 1 - - for ((line, index) <- history.asStrings.zipWithIndex ; if line.toLowerCase contains cmdline) - echo("%d %s".format(index + offset, line)) - } - - private var currentPrompt = Properties.shellPromptString - - /** - * Sets the prompt string used by the REPL. - * - * @param prompt The new prompt string - */ - @DeveloperApi - def setPrompt(prompt: String) = currentPrompt = prompt - - /** - * Represents the current prompt string used by the REPL. - * - * @return The current prompt string - */ - @DeveloperApi - def prompt = currentPrompt - - import LoopCommand.{ cmd, nullary } - - /** Standard commands */ - private lazy val standardCommands = List( - cmd("cp", "", "add a jar or directory to the classpath", addClasspath), - cmd("help", "[command]", "print this summary or command-specific help", helpCommand), - historyCommand, - cmd("h?", "", "search the history", searchHistory), - cmd("imports", "[name name ...]", "show import history, identifying sources of names", importsCommand), - cmd("implicits", "[-v]", "show the implicits in scope", implicitsCommand), - cmd("javap", "", "disassemble a file or class name", javapCommand), - cmd("load", "", "load and interpret a Scala file", loadCommand), - nullary("paste", "enter paste mode: all input up to ctrl-D compiled together", pasteCommand), -// nullary("power", "enable power user mode", powerCmd), - nullary("quit", "exit the repl", () => Result(false, None)), - nullary("replay", "reset execution and replay all previous commands", replay), - nullary("reset", "reset the repl to its initial state, forgetting all session entries", resetCommand), - shCommand, - nullary("silent", "disable/enable automatic printing of results", verbosity), - nullary("fallback", """ - |disable/enable advanced repl changes, these fix some issues but may introduce others. 
- |This mode will be removed once these fixes stablize""".stripMargin, toggleFallbackMode), - cmd("type", "[-v] ", "display the type of an expression without evaluating it", typeCommand), - nullary("warnings", "show the suppressed warnings from the most recent line which had any", warningsCommand) - ) - - /** Power user commands */ - private lazy val powerCommands: List[LoopCommand] = List( - // cmd("phase", "", "set the implicit phase for power commands", phaseCommand) - ) - - // private def dumpCommand(): Result = { - // echo("" + power) - // history.asStrings takeRight 30 foreach echo - // in.redrawLine() - // } - // private def valsCommand(): Result = power.valsDescription - - private val typeTransforms = List( - "scala.collection.immutable." -> "immutable.", - "scala.collection.mutable." -> "mutable.", - "scala.collection.generic." -> "generic.", - "java.lang." -> "jl.", - "scala.runtime." -> "runtime." - ) - - private def importsCommand(line: String): Result = { - val tokens = words(line) - val handlers = intp.languageWildcardHandlers ++ intp.importHandlers - val isVerbose = tokens contains "-v" - - handlers.filterNot(_.importedSymbols.isEmpty).zipWithIndex foreach { - case (handler, idx) => - val (types, terms) = handler.importedSymbols partition (_.name.isTypeName) - val imps = handler.implicitSymbols - val found = tokens filter (handler importsSymbolNamed _) - val typeMsg = if (types.isEmpty) "" else types.size + " types" - val termMsg = if (terms.isEmpty) "" else terms.size + " terms" - val implicitMsg = if (imps.isEmpty) "" else imps.size + " are implicit" - val foundMsg = if (found.isEmpty) "" else found.mkString(" // imports: ", ", ", "") - val statsMsg = List(typeMsg, termMsg, implicitMsg) filterNot (_ == "") mkString ("(", ", ", ")") - - intp.reporter.printMessage("%2d) %-30s %s%s".format( - idx + 1, - handler.importString, - statsMsg, - foundMsg - )) - } - } - - private def implicitsCommand(line: String): Result = onIntp { intp => - import intp._ - import global._ - - def p(x: Any) = intp.reporter.printMessage("" + x) - - // If an argument is given, only show a source with that - // in its name somewhere. - val args = line split "\\s+" - val filtered = intp.implicitSymbolsBySource filter { - case (source, syms) => - (args contains "-v") || { - if (line == "") (source.fullName.toString != "scala.Predef") - else (args exists (source.name.toString contains _)) - } - } - - if (filtered.isEmpty) - return "No implicits have been imported other than those in Predef." - - filtered foreach { - case (source, syms) => - p("/* " + syms.size + " implicit members imported from " + source.fullName + " */") - - // This groups the members by where the symbol is defined - val byOwner = syms groupBy (_.owner) - val sortedOwners = byOwner.toList sortBy { case (owner, _) => afterTyper(source.info.baseClasses indexOf owner) } - - sortedOwners foreach { - case (owner, members) => - // Within each owner, we cluster results based on the final result type - // if there are more than a couple, and sort each cluster based on name. - // This is really just trying to make the 100 or so implicits imported - // by default into something readable. 
- val memberGroups: List[List[Symbol]] = { - val groups = members groupBy (_.tpe.finalResultType) toList - val (big, small) = groups partition (_._2.size > 3) - val xss = ( - (big sortBy (_._1.toString) map (_._2)) :+ - (small flatMap (_._2)) - ) - - xss map (xs => xs sortBy (_.name.toString)) - } - - val ownerMessage = if (owner == source) " defined in " else " inherited from " - p(" /* " + members.size + ownerMessage + owner.fullName + " */") - - memberGroups foreach { group => - group foreach (s => p(" " + intp.symbolDefString(s))) - p("") - } - } - p("") - } - } - - private def findToolsJar() = { - val jdkPath = Directory(jdkHome) - val jar = jdkPath / "lib" / "tools.jar" toFile; - - if (jar isFile) - Some(jar) - else if (jdkPath.isDirectory) - jdkPath.deepFiles find (_.name == "tools.jar") - else None - } - private def addToolsJarToLoader() = { - val cl = findToolsJar match { - case Some(tools) => ScalaClassLoader.fromURLs(Seq(tools.toURL), intp.classLoader) - case _ => intp.classLoader - } - if (Javap.isAvailable(cl)) { - logDebug(":javap available.") - cl - } - else { - logDebug(":javap unavailable: no tools.jar at " + jdkHome) - intp.classLoader - } - } - - private def newJavap() = new JavapClass(addToolsJarToLoader(), new SparkIMain.ReplStrippingWriter(intp)) { - override def tryClass(path: String): Array[Byte] = { - val hd :: rest = path split '.' toList; - // If there are dots in the name, the first segment is the - // key to finding it. - if (rest.nonEmpty) { - intp optFlatName hd match { - case Some(flat) => - val clazz = flat :: rest mkString NAME_JOIN_STRING - val bytes = super.tryClass(clazz) - if (bytes.nonEmpty) bytes - else super.tryClass(clazz + MODULE_SUFFIX_STRING) - case _ => super.tryClass(path) - } - } - else { - // Look for Foo first, then Foo$, but if Foo$ is given explicitly, - // we have to drop the $ to find object Foo, then tack it back onto - // the end of the flattened name. - def className = intp flatName path - def moduleName = (intp flatName path.stripSuffix(MODULE_SUFFIX_STRING)) + MODULE_SUFFIX_STRING - - val bytes = super.tryClass(className) - if (bytes.nonEmpty) bytes - else super.tryClass(moduleName) - } - } - } - // private lazy val javap = substituteAndLog[Javap]("javap", NoJavap)(newJavap()) - private lazy val javap = - try newJavap() - catch { case _: Exception => null } - - // Still todo: modules. - private def typeCommand(line0: String): Result = { - line0.trim match { - case "" => ":type [-v] " - case s if s startsWith "-v " => typeCommandInternal(s stripPrefix "-v " trim, true) - case s => typeCommandInternal(s, false) - } - } - - private def warningsCommand(): Result = { - if (intp.lastWarnings.isEmpty) - "Can't find any cached warnings." - else - intp.lastWarnings foreach { case (pos, msg) => intp.reporter.warning(pos, msg) } - } - - private def javapCommand(line: String): Result = { - if (javap == null) - ":javap unavailable, no tools.jar at %s. 
Set JDK_HOME.".format(jdkHome) - else if (javaVersion startsWith "1.7") - ":javap not yet working with java 1.7" - else if (line == "") - ":javap [-lcsvp] [path1 path2 ...]" - else - javap(words(line)) foreach { res => - if (res.isError) return "Failed: " + res.value - else res.show() - } - } - - private def wrapCommand(line: String): Result = { - def failMsg = "Argument to :wrap must be the name of a method with signature [T](=> T): T" - onIntp { intp => - import intp._ - import global._ - - words(line) match { - case Nil => - intp.executionWrapper match { - case "" => "No execution wrapper is set." - case s => "Current execution wrapper: " + s - } - case "clear" :: Nil => - intp.executionWrapper match { - case "" => "No execution wrapper is set." - case s => intp.clearExecutionWrapper() ; "Cleared execution wrapper." - } - case wrapper :: Nil => - intp.typeOfExpression(wrapper) match { - case PolyType(List(targ), MethodType(List(arg), restpe)) => - intp setExecutionWrapper intp.pathToTerm(wrapper) - "Set wrapper to '" + wrapper + "'" - case tp => - failMsg + "\nFound: " - } - case _ => failMsg - } - } - } - - private def pathToPhaseWrapper = intp.pathToTerm("$r") + ".phased.atCurrent" - // private def phaseCommand(name: String): Result = { - // val phased: Phased = power.phased - // import phased.NoPhaseName - - // if (name == "clear") { - // phased.set(NoPhaseName) - // intp.clearExecutionWrapper() - // "Cleared active phase." - // } - // else if (name == "") phased.get match { - // case NoPhaseName => "Usage: :phase (e.g. typer, erasure.next, erasure+3)" - // case ph => "Active phase is '%s'. (To clear, :phase clear)".format(phased.get) - // } - // else { - // val what = phased.parse(name) - // if (what.isEmpty || !phased.set(what)) - // "'" + name + "' does not appear to represent a valid phase." - // else { - // intp.setExecutionWrapper(pathToPhaseWrapper) - // val activeMessage = - // if (what.toString.length == name.length) "" + what - // else "%s (%s)".format(what, name) - - // "Active phase is now: " + activeMessage - // } - // } - // } - - /** - * Provides a list of available commands. - * - * @return The list of commands - */ - @DeveloperApi - def commands: List[LoopCommand] = standardCommands /*++ ( - if (isReplPower) powerCommands else Nil - )*/ - - private val replayQuestionMessage = - """|That entry seems to have slain the compiler. Shall I replay - |your session? I can re-run each line except the last one. - |[y/n] - """.trim.stripMargin - - private def crashRecovery(ex: Throwable): Boolean = { - echo(ex.toString) - ex match { - case _: NoSuchMethodError | _: NoClassDefFoundError => - echo("\nUnrecoverable error.") - throw ex - case _ => - def fn(): Boolean = - try in.readYesOrNo(replayQuestionMessage, { echo("\nYou must enter y or n.") ; fn() }) - catch { case _: RuntimeException => false } - - if (fn()) replay() - else echo("\nAbandoning crashed session.") - } - true - } - - /** The main read-eval-print loop for the repl. It calls - * command() for each line of input, and stops when - * command() returns false. 
- */ - private def loop() { - def readOneLine() = { - out.flush() - in readLine prompt - } - // return false if repl should exit - def processLine(line: String): Boolean = { - if (isAsync) { - if (!awaitInitialized()) return false - runThunks() - } - if (line eq null) false // assume null means EOF - else command(line) match { - case Result(false, _) => false - case Result(_, Some(finalLine)) => addReplay(finalLine) ; true - case _ => true - } - } - def innerLoop() { - val shouldContinue = try { - processLine(readOneLine()) - } catch {case t: Throwable => crashRecovery(t)} - if (shouldContinue) - innerLoop() - } - innerLoop() - } - - /** interpret all lines from a specified file */ - private def interpretAllFrom(file: File) { - savingReader { - savingReplayStack { - file applyReader { reader => - in = SimpleReader(reader, out, false) - echo("Loading " + file + "...") - loop() - } - } - } - } - - /** create a new interpreter and replay the given commands */ - private def replay() { - reset() - if (replayCommandStack.isEmpty) - echo("Nothing to replay.") - else for (cmd <- replayCommands) { - echo("Replaying: " + cmd) // flush because maybe cmd will have its own output - command(cmd) - echo("") - } - } - private def resetCommand() { - echo("Resetting repl state.") - if (replayCommandStack.nonEmpty) { - echo("Forgetting this session history:\n") - replayCommands foreach echo - echo("") - replayCommandStack = Nil - } - if (intp.namedDefinedTerms.nonEmpty) - echo("Forgetting all expression results and named terms: " + intp.namedDefinedTerms.mkString(", ")) - if (intp.definedTypes.nonEmpty) - echo("Forgetting defined types: " + intp.definedTypes.mkString(", ")) - - reset() - } - - private def reset() { - intp.reset() - // unleashAndSetPhase() - } - - /** fork a shell and run a command */ - private lazy val shCommand = new LoopCommand("sh", "run a shell command (result is implicitly => List[String])") { - override def usage = "" - def apply(line: String): Result = line match { - case "" => showUsage() - case _ => - val toRun = classOf[ProcessResult].getName + "(" + string2codeQuoted(line) + ")" - intp interpret toRun - () - } - } - - private def withFile(filename: String)(action: File => Unit) { - val f = File(filename) - - if (f.exists) action(f) - else echo("That file does not exist") - } - - private def loadCommand(arg: String) = { - var shouldReplay: Option[String] = None - withFile(arg)(f => { - interpretAllFrom(f) - shouldReplay = Some(":load " + arg) - }) - Result(true, shouldReplay) - } - - private def addAllClasspath(args: Seq[String]): Unit = { - var added = false - var totalClasspath = "" - for (arg <- args) { - val f = File(arg).normalize - if (f.exists) { - added = true - addedClasspath = ClassPath.join(addedClasspath, f.path) - totalClasspath = ClassPath.join(settings.classpath.value, addedClasspath) - intp.addUrlsToClassPath(f.toURI.toURL) - sparkContext.addJar(f.toURI.toURL.getPath) - } - } - } - - private def addClasspath(arg: String): Unit = { - val f = File(arg).normalize - if (f.exists) { - addedClasspath = ClassPath.join(addedClasspath, f.path) - intp.addUrlsToClassPath(f.toURI.toURL) - sparkContext.addJar(f.toURI.toURL.getPath) - echo("Added '%s'. Your new classpath is:\n\"%s\"".format(f.path, intp.global.classPath.asClasspathString)) - } - else echo("The path '" + f + "' doesn't seem to exist.") - } - - - private def powerCmd(): Result = { - if (isReplPower) "Already in power mode." 
- else enablePowerMode(false) - } - - private[repl] def enablePowerMode(isDuringInit: Boolean) = { - // replProps.power setValue true - // unleashAndSetPhase() - // asyncEcho(isDuringInit, power.banner) - } - // private def unleashAndSetPhase() { -// if (isReplPower) { -// // power.unleash() -// // Set the phase to "typer" -// intp beSilentDuring phaseCommand("typer") -// } -// } - - private def asyncEcho(async: Boolean, msg: => String) { - if (async) asyncMessage(msg) - else echo(msg) - } - - private def verbosity() = { - // val old = intp.printResults - // intp.printResults = !old - // echo("Switched " + (if (old) "off" else "on") + " result printing.") - } - - /** - * Run one command submitted by the user. Two values are returned: - * (1) whether to keep running, (2) the line to record for replay, - * if any. - */ - private[repl] def command(line: String): Result = { - if (line startsWith ":") { - val cmd = line.tail takeWhile (x => !x.isWhitespace) - uniqueCommand(cmd) match { - case Some(lc) => lc(line.tail stripPrefix cmd dropWhile (_.isWhitespace)) - case _ => ambiguousError(cmd) - } - } - else if (intp.global == null) Result(false, None) // Notice failure to create compiler - else Result(true, interpretStartingWith(line)) - } - - private def readWhile(cond: String => Boolean) = { - Iterator continually in.readLine("") takeWhile (x => x != null && cond(x)) - } - - private def pasteCommand(): Result = { - echo("// Entering paste mode (ctrl-D to finish)\n") - val code = readWhile(_ => true) mkString "\n" - echo("\n// Exiting paste mode, now interpreting.\n") - intp interpret code - () - } - - private object paste extends Pasted { - val ContinueString = " | " - val PromptString = "scala> " - - def interpret(line: String): Unit = { - echo(line.trim) - intp interpret line - echo("") - } - - def transcript(start: String) = { - echo("\n// Detected repl transcript paste: ctrl-D to finish.\n") - apply(Iterator(start) ++ readWhile(_.trim != PromptString.trim)) - } - } - import paste.{ ContinueString, PromptString } - - /** - * Interpret expressions starting with the first line. - * Read lines until a complete compilation unit is available - * or until a syntax error has been seen. If a full unit is - * read, go ahead and interpret it. Return the full string - * to be recorded for replay, if any. - */ - private def interpretStartingWith(code: String): Option[String] = { - // signal completion non-completion input has been received - in.completion.resetVerbosity() - - def reallyInterpret = { - val reallyResult = intp.interpret(code) - (reallyResult, reallyResult match { - case IR.Error => None - case IR.Success => Some(code) - case IR.Incomplete => - if (in.interactive && code.endsWith("\n\n")) { - echo("You typed two blank lines. Starting a new command.") - None - } - else in.readLine(ContinueString) match { - case null => - // we know compilation is going to fail since we're at EOF and the - // parser thinks the input is still incomplete, but since this is - // a file being read non-interactively we want to fail. So we send - // it straight to the compiler for the nice error message. - intp.compileString(code) - None - - case line => interpretStartingWith(code + "\n" + line) - } - }) - } - - /** Here we place ourselves between the user and the interpreter and examine - * the input they are ostensibly submitting. We intervene in several cases: - * - * 1) If the line starts with "scala> " it is assumed to be an interpreter paste. - * 2) If the line starts with "." (but not ".." 
or "./") it is treated as an invocation - * on the previous result. - * 3) If the Completion object's execute returns Some(_), we inject that value - * and avoid the interpreter, as it's likely not valid scala code. - */ - if (code == "") None - else if (!paste.running && code.trim.startsWith(PromptString)) { - paste.transcript(code) - None - } - else if (Completion.looksLikeInvocation(code) && intp.mostRecentVar != "") { - interpretStartingWith(intp.mostRecentVar + code) - } - else if (code.trim startsWith "//") { - // line comment, do nothing - None - } - else - reallyInterpret._2 - } - - // runs :load `file` on any files passed via -i - private def loadFiles(settings: Settings) = settings match { - case settings: SparkRunnerSettings => - for (filename <- settings.loadfiles.value) { - val cmd = ":load " + filename - command(cmd) - addReplay(cmd) - echo("") - } - case _ => - } - - /** Tries to create a JLineReader, falling back to SimpleReader: - * unless settings or properties are such that it should start - * with SimpleReader. - */ - private def chooseReader(settings: Settings): InteractiveReader = { - if (settings.Xnojline.value || Properties.isEmacsShell) - SimpleReader() - else try new SparkJLineReader( - if (settings.noCompletion.value) NoCompletion - else new SparkJLineCompletion(intp) - ) - catch { - case ex @ (_: Exception | _: NoClassDefFoundError) => - echo("Failed to created SparkJLineReader: " + ex + "\nFalling back to SimpleReader.") - SimpleReader() - } - } - - private val u: scala.reflect.runtime.universe.type = scala.reflect.runtime.universe - private val m = u.runtimeMirror(Utils.getSparkClassLoader) - private def tagOfStaticClass[T: ClassTag]: u.TypeTag[T] = - u.TypeTag[T]( - m, - new TypeCreator { - def apply[U <: ApiUniverse with Singleton](m: Mirror[U]): U # Type = - m.staticClass(classTag[T].runtimeClass.getName).toTypeConstructor.asInstanceOf[U # Type] - }) - - private def process(settings: Settings): Boolean = savingContextLoader { - this.settings = settings - createInterpreter() - - // sets in to some kind of reader depending on environmental cues - in = in0 match { - case Some(reader) => SimpleReader(reader, out, true) - case None => - // some post-initialization - chooseReader(settings) match { - case x: SparkJLineReader => addThunk(x.consoleReader.postInit) ; x - case x => x - } - } - lazy val tagOfSparkIMain = tagOfStaticClass[org.apache.spark.repl.SparkIMain] - // Bind intp somewhere out of the regular namespace where - // we can get at it in generated code. - addThunk(intp.quietBind(NamedParam[SparkIMain]("$intp", intp)(tagOfSparkIMain, classTag[SparkIMain]))) - addThunk({ - import scala.tools.nsc.io._ - import Properties.userHome - import scala.compat.Platform.EOL - val autorun = replProps.replAutorunCode.option flatMap (f => io.File(f).safeSlurp()) - if (autorun.isDefined) intp.quietRun(autorun.get) - }) - - addThunk(printWelcome()) - addThunk(initializeSpark()) - - // it is broken on startup; go ahead and exit - if (intp.reporter.hasErrors) - return false - - // This is about the illusion of snappiness. We call initialize() - // which spins off a separate thread, then print the prompt and try - // our best to look ready. The interlocking lazy vals tend to - // inter-deadlock, so we break the cycle with a single asynchronous - // message to an rpcEndpoint. 
- if (isAsync) { - intp initialize initializedCallback() - createAsyncListener() // listens for signal to run postInitialization - } - else { - intp.initializeSynchronous() - postInitialization() - } - // printWelcome() - - loadFiles(settings) - - try loop() - catch AbstractOrMissingHandler() - finally closeInterpreter() - - true - } - - // NOTE: Must be public for visibility - @DeveloperApi - def createSparkSession(): SparkSession = { - val execUri = System.getenv("SPARK_EXECUTOR_URI") - val jars = getAddedJars() - val conf = new SparkConf() - .setMaster(getMaster()) - .setJars(jars) - .setIfMissing("spark.app.name", "Spark shell") - // SparkContext will detect this configuration and register it with the RpcEnv's - // file server, setting spark.repl.class.uri to the actual URI for executors to - // use. This is sort of ugly but since executors are started as part of SparkContext - // initialization in certain cases, there's an initialization order issue that prevents - // this from being set after SparkContext is instantiated. - .set("spark.repl.class.outputDir", intp.outputDir.getAbsolutePath()) - if (execUri != null) { - conf.set("spark.executor.uri", execUri) - } - - val builder = SparkSession.builder.config(conf) - val sparkSession = if (SparkSession.hiveClassesArePresent) { - logInfo("Creating Spark session with Hive support") - builder.enableHiveSupport().getOrCreate() - } else { - logInfo("Creating Spark session") - builder.getOrCreate() - } - sparkContext = sparkSession.sparkContext - sparkSession - } - - private def getMaster(): String = { - val master = this.master match { - case Some(m) => m - case None => - val envMaster = sys.env.get("MASTER") - val propMaster = sys.props.get("spark.master") - propMaster.orElse(envMaster).getOrElse("local[*]") - } - master - } - - /** process command-line arguments and do as they request */ - def process(args: Array[String]): Boolean = { - val command = new SparkCommandLine(args.toList, msg => echo(msg)) - def neededHelp(): String = - (if (command.settings.help.value) command.usageMsg + "\n" else "") + - (if (command.settings.Xhelp.value) command.xusageMsg + "\n" else "") - - // if they asked for no help and command is valid, we call the real main - neededHelp() match { - case "" => command.ok && process(command.settings) - case help => echoNoNL(help) ; true - } - } - - @deprecated("Use `process` instead", "2.9.0") - private def main(settings: Settings): Unit = process(settings) - - @DeveloperApi - def getAddedJars(): Array[String] = { - val conf = new SparkConf().setMaster(getMaster()) - val envJars = sys.env.get("ADD_JARS") - if (envJars.isDefined) { - logWarning("ADD_JARS environment variable is deprecated, use --jar spark submit argument instead") - } - val jars = { - val userJars = Utils.getUserJars(conf, isShell = true) - if (userJars.isEmpty) { - envJars.getOrElse("") - } else { - userJars.mkString(",") - } - } - Utils.resolveURIs(jars).split(",").filter(_.nonEmpty) - } - -} - -object SparkILoop extends Logging { - implicit def loopToInterpreter(repl: SparkILoop): SparkIMain = repl.intp - private def echo(msg: String) = Console println msg - - // Designed primarily for use by test code: take a String with a - // bunch of code, and prints out a transcript of what it would look - // like if you'd just typed it into the repl. 
- private[repl] def runForTranscript(code: String, settings: Settings): String = { - import java.io.{ BufferedReader, StringReader, OutputStreamWriter } - - stringFromStream { ostream => - Console.withOut(ostream) { - val output = new JPrintWriter(new OutputStreamWriter(ostream), true) { - override def write(str: String) = { - // completely skip continuation lines - if (str forall (ch => ch.isWhitespace || ch == '|')) () - // print a newline on empty scala prompts - else if ((str contains '\n') && (str.trim == "scala> ")) super.write("\n") - else super.write(str) - } - } - val input = new BufferedReader(new StringReader(code)) { - override def readLine(): String = { - val s = super.readLine() - // helping out by printing the line being interpreted. - if (s != null) - // scalastyle:off println - output.println(s) - // scalastyle:on println - s - } - } - val repl = new SparkILoop(input, output) - - if (settings.classpath.isDefault) - settings.classpath.value = sys.props("java.class.path") - - repl.getAddedJars().map(jar => new URI(jar).getPath).foreach(settings.classpath.append(_)) - - repl process settings - } - } - } - - /** Creates an interpreter loop with default settings and feeds - * the given code to it as input. - */ - private[repl] def run(code: String, sets: Settings = new Settings): String = { - import java.io.{ BufferedReader, StringReader, OutputStreamWriter } - - stringFromStream { ostream => - Console.withOut(ostream) { - val input = new BufferedReader(new StringReader(code)) - val output = new JPrintWriter(new OutputStreamWriter(ostream), true) - val repl = new ILoop(input, output) - - if (sets.classpath.isDefault) - sets.classpath.value = sys.props("java.class.path") - - repl process sets - } - } - } - private[repl] def run(lines: List[String]): String = run(lines map (_ + "\n") mkString) -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala deleted file mode 100644 index 5f0d92bccd809..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ /dev/null @@ -1,168 +0,0 @@ -// scalastyle:off - -/* NSC -- new Scala compiler - * Copyright 2005-2013 LAMP/EPFL - * @author Paul Phillips - */ - -package org.apache.spark.repl - -import scala.tools.nsc._ -import scala.tools.nsc.interpreter._ - -import scala.tools.nsc.util.stackTraceString - -import org.apache.spark.SPARK_VERSION - -/** - * Machinery for the asynchronous initialization of the repl. 
- */ -private[repl] trait SparkILoopInit { - self: SparkILoop => - - /** Print a welcome message */ - def printWelcome() { - echo("""Welcome to - ____ __ - / __/__ ___ _____/ /__ - _\ \/ _ \/ _ `/ __/ '_/ - /___/ .__/\_,_/_/ /_/\_\ version %s - /_/ -""".format(SPARK_VERSION)) - import Properties._ - val welcomeMsg = "Using Scala %s (%s, Java %s)".format( - versionString, javaVmName, javaVersion) - echo(welcomeMsg) - echo("Type in expressions to have them evaluated.") - echo("Type :help for more information.") - } - - protected def asyncMessage(msg: String) { - if (isReplInfo || isReplPower) - echoAndRefresh(msg) - } - - private val initLock = new java.util.concurrent.locks.ReentrantLock() - private val initCompilerCondition = initLock.newCondition() // signal the compiler is initialized - private val initLoopCondition = initLock.newCondition() // signal the whole repl is initialized - private val initStart = System.nanoTime - - private def withLock[T](body: => T): T = { - initLock.lock() - try body - finally initLock.unlock() - } - // a condition used to ensure serial access to the compiler. - @volatile private var initIsComplete = false - @volatile private var initError: String = null - private def elapsed() = "%.3f".format((System.nanoTime - initStart).toDouble / 1000000000L) - - // the method to be called when the interpreter is initialized. - // Very important this method does nothing synchronous (i.e. do - // not try to use the interpreter) because until it returns, the - // repl's lazy val `global` is still locked. - protected def initializedCallback() = withLock(initCompilerCondition.signal()) - - // Spins off a thread which awaits a single message once the interpreter - // has been initialized. - protected def createAsyncListener() = { - io.spawn { - withLock(initCompilerCondition.await()) - asyncMessage("[info] compiler init time: " + elapsed() + " s.") - postInitialization() - } - } - - // called from main repl loop - protected def awaitInitialized(): Boolean = { - if (!initIsComplete) - withLock { while (!initIsComplete) initLoopCondition.await() } - if (initError != null) { - // scalastyle:off println - println(""" - |Failed to initialize the REPL due to an unexpected error. - |This is a bug, please, report it along with the error diagnostics printed below. 
- |%s.""".stripMargin.format(initError) - ) - // scalastyle:on println - false - } else true - } - // private def warningsThunks = List( - // () => intp.bind("lastWarnings", "" + typeTag[List[(Position, String)]], intp.lastWarnings _), - // ) - - protected def postInitThunks = List[Option[() => Unit]]( - Some(intp.setContextClassLoader _), - if (isReplPower) Some(() => enablePowerMode(true)) else None - ).flatten - // ++ ( - // warningsThunks - // ) - // called once after init condition is signalled - protected def postInitialization() { - try { - postInitThunks foreach (f => addThunk(f())) - runThunks() - } catch { - case ex: Throwable => - initError = stackTraceString(ex) - throw ex - } finally { - initIsComplete = true - - if (isAsync) { - asyncMessage("[info] total init time: " + elapsed() + " s.") - withLock(initLoopCondition.signal()) - } - } - } - - def initializeSpark() { - intp.beQuietDuring { - command(""" - @transient val spark = org.apache.spark.repl.Main.interp.createSparkSession() - @transient val sc = { - val _sc = spark.sparkContext - if (_sc.getConf.getBoolean("spark.ui.reverseProxy", false)) { - val proxyUrl = _sc.getConf.get("spark.ui.reverseProxyUrl", null) - if (proxyUrl != null) { - println(s"Spark Context Web UI is available at ${proxyUrl}/proxy/${_sc.applicationId}") - } else { - println(s"Spark Context Web UI is available at Spark Master Public URL") - } - } else { - _sc.uiWebUrl.foreach { - webUrl => println(s"Spark context Web UI available at ${webUrl}") - } - } - println("Spark context available as 'sc' " + - s"(master = ${_sc.master}, app id = ${_sc.applicationId}).") - println("Spark session available as 'spark'.") - _sc - } - """) - command("import org.apache.spark.SparkContext._") - command("import spark.implicits._") - command("import spark.sql") - command("import org.apache.spark.sql.functions._") - } - } - - // code to be executed only after the interpreter is initialized - // and the lazy val `global` can be accessed without risk of deadlock. 
- private var pendingThunks: List[() => Unit] = Nil - protected def addThunk(body: => Unit) = synchronized { - pendingThunks :+= (() => body) - } - protected def runThunks(): Unit = synchronized { - if (pendingThunks.nonEmpty) - logDebug("Clearing " + pendingThunks.size + " thunks.") - - while (pendingThunks.nonEmpty) { - val thunk = pendingThunks.head - pendingThunks = pendingThunks.tail - thunk() - } - } -} diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala deleted file mode 100644 index 74a04d5a42bb2..0000000000000 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala +++ /dev/null @@ -1,1808 +0,0 @@ -// scalastyle:off - -/* NSC -- new Scala compiler - * Copyright 2005-2013 LAMP/EPFL - * @author Martin Odersky - */ - -package org.apache.spark.repl - -import java.io.File - -import scala.tools.nsc._ -import scala.tools.nsc.backend.JavaPlatform -import scala.tools.nsc.interpreter._ - -import Predef.{ println => _, _ } -import scala.tools.nsc.util.{MergedClassPath, stringFromWriter, ScalaClassLoader, stackTraceString} -import scala.reflect.internal.util._ -import java.net.URL -import scala.sys.BooleanProp -import io.{AbstractFile, PlainFile, VirtualDirectory} - -import reporters._ -import symtab.Flags -import scala.reflect.internal.Names -import scala.tools.util.PathResolver -import ScalaClassLoader.URLClassLoader -import scala.tools.nsc.util.Exceptional.unwrap -import scala.collection.{ mutable, immutable } -import scala.util.control.Exception.{ ultimately } -import SparkIMain._ -import java.util.concurrent.Future -import typechecker.Analyzer -import scala.language.implicitConversions -import scala.reflect.runtime.{ universe => ru } -import scala.reflect.{ ClassTag, classTag } -import scala.tools.reflect.StdRuntimeTags._ -import scala.util.control.ControlThrowable - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils -import org.apache.spark.annotation.DeveloperApi - -// /** directory to save .class files to */ -// private class ReplVirtualDirectory(out: JPrintWriter) extends VirtualDirectory("((memory))", None) { -// private def pp(root: AbstractFile, indentLevel: Int) { -// val spaces = " " * indentLevel -// out.println(spaces + root.name) -// if (root.isDirectory) -// root.toList sortBy (_.name) foreach (x => pp(x, indentLevel + 1)) -// } -// // print the contents hierarchically -// def show() = pp(this, 0) -// } - - /** An interpreter for Scala code. - * - * The main public entry points are compile(), interpret(), and bind(). - * The compile() method loads a complete Scala file. The interpret() method - * executes one line of Scala code at the request of the user. The bind() - * method binds an object to a variable that can then be used by later - * interpreted code. - * - * The overall approach is based on compiling the requested code and then - * using a Java classloader and Java reflection to run the code - * and access its results. - * - * In more detail, a single compiler instance is used - * to accumulate all successfully compiled or interpreted Scala code. To - * "interpret" a line of code, the compiler generates a fresh object that - * includes the line of code and which has public member(s) to export - * all variables defined by that code. 
To extract the result of an - * interpreted line to show the user, a second "result object" is created - * which imports the variables exported by the above object and then - * exports members called "$eval" and "$print". To accommodate user expressions - * that read from variables or methods defined in previous statements, "import" - * statements are used. - * - * This interpreter shares the strengths and weaknesses of using the - * full compiler-to-Java. The main strength is that interpreted code - * behaves exactly as does compiled code, including running at full speed. - * The main weakness is that redefining classes and methods is not handled - * properly, because rebinding at the Java level is technically difficult. - * - * @author Moez A. Abdel-Gawad - * @author Lex Spoon - */ - @DeveloperApi - class SparkIMain( - initialSettings: Settings, - val out: JPrintWriter, - propagateExceptions: Boolean = false) - extends SparkImports with Logging { imain => - - private val conf = new SparkConf() - - private val SPARK_DEBUG_REPL: Boolean = (System.getenv("SPARK_DEBUG_REPL") == "1") - /** Local directory to save .class files too */ - private[repl] val outputDir = { - val rootDir = conf.getOption("spark.repl.classdir").getOrElse(Utils.getLocalDir(conf)) - Utils.createTempDir(root = rootDir, namePrefix = "repl") - } - if (SPARK_DEBUG_REPL) { - echo("Output directory: " + outputDir) - } - - /** - * Returns the path to the output directory containing all generated - * class files that will be served by the REPL class server. - */ - @DeveloperApi - lazy val getClassOutputDirectory = outputDir - - private val virtualDirectory = new PlainFile(outputDir) // "directory" for classfiles - /** Jetty server that will serve our classes to worker nodes */ - private var currentSettings: Settings = initialSettings - private var printResults = true // whether to print result lines - private var totalSilence = false // whether to print anything - private var _initializeComplete = false // compiler is initialized - private var _isInitialized: Future[Boolean] = null // set up initialization future - private var bindExceptions = true // whether to bind the lastException variable - private var _executionWrapper = "" // code to be wrapped around all lines - - /** We're going to go to some trouble to initialize the compiler asynchronously. - * It's critical that nothing call into it until it's been initialized or we will - * run into unrecoverable issues, but the perceived repl startup time goes - * through the roof if we wait for it. So we initialize it with a future and - * use a lazy val to ensure that any attempt to use the compiler object waits - * on the future. 
- */ - private var _classLoader: AbstractFileClassLoader = null // active classloader - private val _compiler: Global = newCompiler(settings, reporter) // our private compiler - - private trait ExposeAddUrl extends URLClassLoader { def addNewUrl(url: URL) = this.addURL(url) } - private var _runtimeClassLoader: URLClassLoader with ExposeAddUrl = null // wrapper exposing addURL - - private val nextReqId = { - var counter = 0 - () => { counter += 1 ; counter } - } - - private def compilerClasspath: Seq[URL] = ( - if (isInitializeComplete) global.classPath.asURLs - else new PathResolver(settings).result.asURLs // the compiler's classpath - ) - // NOTE: Exposed to repl package since accessed indirectly from SparkIMain - private[repl] def settings = currentSettings - private def mostRecentLine = prevRequestList match { - case Nil => "" - case req :: _ => req.originalLine - } - // Run the code body with the given boolean settings flipped to true. - private def withoutWarnings[T](body: => T): T = beQuietDuring { - val saved = settings.nowarn.value - if (!saved) - settings.nowarn.value = true - - try body - finally if (!saved) settings.nowarn.value = false - } - - /** construct an interpreter that reports to Console */ - def this(settings: Settings) = this(settings, new NewLinePrintWriter(new ConsoleWriter, true)) - def this() = this(new Settings()) - - private lazy val repllog: Logger = new Logger { - val out: JPrintWriter = imain.out - val isInfo: Boolean = BooleanProp keyExists "scala.repl.info" - val isDebug: Boolean = BooleanProp keyExists "scala.repl.debug" - val isTrace: Boolean = BooleanProp keyExists "scala.repl.trace" - } - private[repl] lazy val formatting: Formatting = new Formatting { - val prompt = Properties.shellPromptString - } - - // NOTE: Exposed to repl package since used by SparkExprTyper and SparkILoop - private[repl] lazy val reporter: ConsoleReporter = new SparkIMain.ReplReporter(this) - - /** - * Determines if errors were reported (typically during compilation). - * - * @note This is not for runtime errors - * - * @return True if had errors, otherwise false - */ - @DeveloperApi - def isReportingErrors = reporter.hasErrors - - import formatting._ - import reporter.{ printMessage, withoutTruncating } - - // This exists mostly because using the reporter too early leads to deadlock. - private def echo(msg: String) { Console println msg } - private def _initSources = List(new BatchSourceFile("", "class $repl_$init { }")) - private def _initialize() = { - try { - // todo. if this crashes, REPL will hang - new _compiler.Run() compileSources _initSources - _initializeComplete = true - true - } - catch AbstractOrMissingHandler() - } - private def tquoted(s: String) = "\"\"\"" + s + "\"\"\"" - - // argument is a thunk to execute after init is done - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def initialize(postInitSignal: => Unit) { - synchronized { - if (_isInitialized == null) { - _isInitialized = io.spawn { - try _initialize() - finally postInitSignal - } - } - } - } - - /** - * Initializes the underlying compiler/interpreter in a blocking fashion. - * - * @note Must be executed before using SparkIMain! - */ - @DeveloperApi - def initializeSynchronous(): Unit = { - if (!isInitializeComplete) { - _initialize() - assert(global != null, global) - } - } - private def isInitializeComplete = _initializeComplete - - /** the public, go through the future compiler */ - - /** - * The underlying compiler used to generate ASTs and execute code. 
- */ - @DeveloperApi - lazy val global: Global = { - if (isInitializeComplete) _compiler - else { - // If init hasn't been called yet you're on your own. - if (_isInitialized == null) { - logWarning("Warning: compiler accessed before init set up. Assuming no postInit code.") - initialize(()) - } - // // blocks until it is ; false means catastrophic failure - if (_isInitialized.get()) _compiler - else null - } - } - @deprecated("Use `global` for access to the compiler instance.", "2.9.0") - private lazy val compiler: global.type = global - - import global._ - import definitions.{ScalaPackage, JavaLangPackage, termMember, typeMember} - import rootMirror.{RootClass, getClassIfDefined, getModuleIfDefined, getRequiredModule, getRequiredClass} - - private implicit class ReplTypeOps(tp: Type) { - def orElse(other: => Type): Type = if (tp ne NoType) tp else other - def andAlso(fn: Type => Type): Type = if (tp eq NoType) tp else fn(tp) - } - - // TODO: If we try to make naming a lazy val, we run into big time - // scalac unhappiness with what look like cycles. It has not been easy to - // reduce, but name resolution clearly takes different paths. - // NOTE: Exposed to repl package since used by SparkExprTyper - private[repl] object naming extends { - val global: imain.global.type = imain.global - } with Naming { - // make sure we don't overwrite their unwisely named res3 etc. - def freshUserTermName(): TermName = { - val name = newTermName(freshUserVarName()) - if (definedNameMap contains name) freshUserTermName() - else name - } - def isUserTermName(name: Name) = isUserVarName("" + name) - def isInternalTermName(name: Name) = isInternalVarName("" + name) - } - import naming._ - - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] object deconstruct extends { - val global: imain.global.type = imain.global - } with StructuredTypeStrings - - // NOTE: Exposed to repl package since used by SparkImports - private[repl] lazy val memberHandlers = new { - val intp: imain.type = imain - } with SparkMemberHandlers - import memberHandlers._ - - /** - * Suppresses overwriting print results during the operation. - * - * @param body The block to execute - * @tparam T The return type of the block - * - * @return The result from executing the block - */ - @DeveloperApi - def beQuietDuring[T](body: => T): T = { - val saved = printResults - printResults = false - try body - finally printResults = saved - } - - /** - * Completely masks all output during the operation (minus JVM standard - * out and error). 
- * - * @param operation The block to execute - * @tparam T The return type of the block - * - * @return The result from executing the block - */ - @DeveloperApi - def beSilentDuring[T](operation: => T): T = { - val saved = totalSilence - totalSilence = true - try operation - finally totalSilence = saved - } - - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def quietRun[T](code: String) = beQuietDuring(interpret(code)) - - private def logAndDiscard[T](label: String, alt: => T): PartialFunction[Throwable, T] = { - case t: ControlThrowable => throw t - case t: Throwable => - logDebug(label + ": " + unwrap(t)) - logDebug(stackTraceString(unwrap(t))) - alt - } - /** takes AnyRef because it may be binding a Throwable or an Exceptional */ - - private def withLastExceptionLock[T](body: => T, alt: => T): T = { - assert(bindExceptions, "withLastExceptionLock called incorrectly.") - bindExceptions = false - - try beQuietDuring(body) - catch logAndDiscard("withLastExceptionLock", alt) - finally bindExceptions = true - } - - /** - * Contains the code (in string form) representing a wrapper around all - * code executed by this instance. - * - * @return The wrapper code as a string - */ - @DeveloperApi - def executionWrapper = _executionWrapper - - /** - * Sets the code to use as a wrapper around all code executed by this - * instance. - * - * @param code The wrapper code as a string - */ - @DeveloperApi - def setExecutionWrapper(code: String) = _executionWrapper = code - - /** - * Clears the code used as a wrapper around all code executed by - * this instance. - */ - @DeveloperApi - def clearExecutionWrapper() = _executionWrapper = "" - - /** interpreter settings */ - private lazy val isettings = new SparkISettings(this) - - /** - * Instantiates a new compiler used by SparkIMain. Overridable to provide - * own instance of a compiler. - * - * @param settings The settings to provide the compiler - * @param reporter The reporter to use for compiler output - * - * @return The compiler as a Global - */ - @DeveloperApi - protected def newCompiler(settings: Settings, reporter: Reporter): ReplGlobal = { - settings.outputDirs setSingleOutput virtualDirectory - settings.exposeEmptyPackage.value = true - new Global(settings, reporter) with ReplGlobal { - override def toString: String = "" - } - } - - /** - * Adds any specified jars to the compile and runtime classpaths. 
- * - * @note Currently only supports jars, not directories - * @param urls The list of items to add to the compile and runtime classpaths - */ - @DeveloperApi - def addUrlsToClassPath(urls: URL*): Unit = { - new Run // Needed to force initialization of "something" to correctly load Scala classes from jars - urls.foreach(_runtimeClassLoader.addNewUrl) // Add jars/classes to runtime for execution - updateCompilerClassPath(urls: _*) // Add jars/classes to compile time for compiling - } - - private def updateCompilerClassPath(urls: URL*): Unit = { - require(!global.forMSIL) // Only support JavaPlatform - - val platform = global.platform.asInstanceOf[JavaPlatform] - - val newClassPath = mergeUrlsIntoClassPath(platform, urls: _*) - - // NOTE: Must use reflection until this is exposed/fixed upstream in Scala - val fieldSetter = platform.getClass.getMethods - .find(_.getName.endsWith("currentClassPath_$eq")).get - fieldSetter.invoke(platform, Some(newClassPath)) - - // Reload all jars specified into our compiler - global.invalidateClassPathEntries(urls.map(_.getPath): _*) - } - - private def mergeUrlsIntoClassPath(platform: JavaPlatform, urls: URL*): MergedClassPath[AbstractFile] = { - // Collect our new jars/directories and add them to the existing set of classpaths - val allClassPaths = ( - platform.classPath.asInstanceOf[MergedClassPath[AbstractFile]].entries ++ - urls.map(url => { - platform.classPath.context.newClassPath( - if (url.getProtocol == "file") { - val f = new File(url.getPath) - if (f.isDirectory) - io.AbstractFile.getDirectory(f) - else - io.AbstractFile.getFile(f) - } else { - io.AbstractFile.getURL(url) - } - ) - }) - ).distinct - - // Combine all of our classpaths (old and new) into one merged classpath - new MergedClassPath(allClassPaths, platform.classPath.context) - } - - /** - * Represents the parent classloader used by this instance. Can be - * overridden to provide alternative classloader. - * - * @return The classloader used as the parent loader of this instance - */ - @DeveloperApi - protected def parentClassLoader: ClassLoader = - SparkHelper.explicitParentLoader(settings).getOrElse( this.getClass.getClassLoader() ) - - /* A single class loader is used for all commands interpreted by this Interpreter. - It would also be possible to create a new class loader for each command - to interpret. The advantages of the current approach are: - - - Expressions are only evaluated one time. This is especially - significant for I/O, e.g. "val x = Console.readLine" - - The main disadvantage is: - - - Objects, classes, and methods cannot be rebound. Instead, definitions - shadow the old ones, and old code objects refer to the old - definitions. - */ - private def resetClassLoader() = { - logDebug("Setting new classloader: was " + _classLoader) - _classLoader = null - ensureClassLoader() - } - private final def ensureClassLoader() { - if (_classLoader == null) - _classLoader = makeClassLoader() - } - - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def classLoader: AbstractFileClassLoader = { - ensureClassLoader() - _classLoader - } - private class TranslatingClassLoader(parent: ClassLoader) extends AbstractFileClassLoader(virtualDirectory, parent) { - /** Overridden here to try translating a simple name to the generated - * class name if the original attempt fails. This method is used by - * getResourceAsStream as well as findClass. 
- */ - override protected def findAbstractFile(name: String): AbstractFile = { - super.findAbstractFile(name) match { - // deadlocks on startup if we try to translate names too early - case null if isInitializeComplete => - generatedName(name) map (x => super.findAbstractFile(x)) orNull - case file => - file - } - } - } - private def makeClassLoader(): AbstractFileClassLoader = - new TranslatingClassLoader(parentClassLoader match { - case null => ScalaClassLoader fromURLs compilerClasspath - case p => - _runtimeClassLoader = new URLClassLoader(compilerClasspath, p) with ExposeAddUrl - _runtimeClassLoader - }) - - private def getInterpreterClassLoader() = classLoader - - // Set the current Java "context" class loader to this interpreter's class loader - // NOTE: Exposed to repl package since used by SparkILoopInit - private[repl] def setContextClassLoader() = classLoader.setAsContext() - - /** - * Returns the real name of a class based on its repl-defined name. - * - * ==Example== - * Given a simple repl-defined name, returns the real name of - * the class representing it, e.g. for "Bippy" it may return - * {{{ - * $line19.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Bippy - * }}} - * - * @param simpleName The repl-defined name whose real name to retrieve - * - * @return Some real name if the simple name exists, else None - */ - @DeveloperApi - def generatedName(simpleName: String): Option[String] = { - if (simpleName endsWith nme.MODULE_SUFFIX_STRING) optFlatName(simpleName.init) map (_ + nme.MODULE_SUFFIX_STRING) - else optFlatName(simpleName) - } - - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def flatName(id: String) = optFlatName(id) getOrElse id - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def optFlatName(id: String) = requestForIdent(id) map (_ fullFlatName id) - - /** - * Retrieves all simple names contained in the current instance. - * - * @return A list of sorted names - */ - @DeveloperApi - def allDefinedNames = definedNameMap.keys.toList.sorted - - private def pathToType(id: String): String = pathToName(newTypeName(id)) - // NOTE: Exposed to repl package since used by SparkILoop - private[repl] def pathToTerm(id: String): String = pathToName(newTermName(id)) - - /** - * Retrieves the full code path to access the specified simple name - * content. - * - * @param name The simple name of the target whose path to determine - * - * @return The full path used to access the specified target (name) - */ - @DeveloperApi - def pathToName(name: Name): String = { - if (definedNameMap contains name) - definedNameMap(name) fullPath name - else name.toString - } - - /** Most recent tree handled which wasn't wholly synthetic. */ - private def mostRecentlyHandledTree: Option[Tree] = { - prevRequests.reverse foreach { req => - req.handlers.reverse foreach { - case x: MemberDefHandler if x.definesValue && !isInternalTermName(x.name) => return Some(x.member) - case _ => () - } - } - None - } - - /** Stubs for work in progress. 
*/ - private def handleTypeRedefinition(name: TypeName, old: Request, req: Request) = { - for (t1 <- old.simpleNameOfType(name) ; t2 <- req.simpleNameOfType(name)) { - logDebug("Redefining type '%s'\n %s -> %s".format(name, t1, t2)) - } - } - - private def handleTermRedefinition(name: TermName, old: Request, req: Request) = { - for (t1 <- old.compilerTypeOf get name ; t2 <- req.compilerTypeOf get name) { - // Printing the types here has a tendency to cause assertion errors, like - // assertion failed: fatal: has owner value x, but a class owner is required - // so DBG is by-name now to keep it in the family. (It also traps the assertion error, - // but we don't want to unnecessarily risk hosing the compiler's internal state.) - logDebug("Redefining term '%s'\n %s -> %s".format(name, t1, t2)) - } - } - - private def recordRequest(req: Request) { - if (req == null || referencedNameMap == null) - return - - prevRequests += req - req.referencedNames foreach (x => referencedNameMap(x) = req) - - // warning about serially defining companions. It'd be easy - // enough to just redefine them together but that may not always - // be what people want so I'm waiting until I can do it better. - for { - name <- req.definedNames filterNot (x => req.definedNames contains x.companionName) - oldReq <- definedNameMap get name.companionName - newSym <- req.definedSymbols get name - oldSym <- oldReq.definedSymbols get name.companionName - if Seq(oldSym, newSym).permutations exists { case Seq(s1, s2) => s1.isClass && s2.isModule } - } { - afterTyper(replwarn(s"warning: previously defined $oldSym is not a companion to $newSym.")) - replwarn("Companions must be defined together; you may wish to use :paste mode for this.") - } - - // Updating the defined name map - req.definedNames foreach { name => - if (definedNameMap contains name) { - if (name.isTypeName) handleTypeRedefinition(name.toTypeName, definedNameMap(name), req) - else handleTermRedefinition(name.toTermName, definedNameMap(name), req) - } - definedNameMap(name) = req - } - } - - private def replwarn(msg: => String) { - if (!settings.nowarnings.value) - printMessage(msg) - } - - private def isParseable(line: String): Boolean = { - beSilentDuring { - try parse(line) match { - case Some(xs) => xs.nonEmpty // parses as-is - case None => true // incomplete - } - catch { case x: Exception => // crashed the compiler - replwarn("Exception in isParseable(\"" + line + "\"): " + x) - false - } - } - } - - private def compileSourcesKeepingRun(sources: SourceFile*) = { - val run = new Run() - reporter.reset() - run compileSources sources.toList - (!reporter.hasErrors, run) - } - - /** - * Compiles specified source files. - * - * @param sources The sequence of source files to compile - * - * @return True if successful, otherwise false - */ - @DeveloperApi - def compileSources(sources: SourceFile*): Boolean = - compileSourcesKeepingRun(sources: _*)._1 - - /** - * Compiles a string of code. - * - * @param code The string of code to compile - * - * @return True if successful, otherwise false - */ - @DeveloperApi - def compileString(code: String): Boolean = - compileSources(new BatchSourceFile(" - UIUtils.headerSparkPage("SQL", content, parent, Some(5000)) + val summary: NodeSeq = +
+      <div>
+        <ul>
+          {
+            if (listener.getRunningExecutions.nonEmpty) {
+              <li>
+                Running Queries:
+                {listener.getRunningExecutions.size}
+              </li>
+            }
+          }
+          {
+            if (listener.getCompletedExecutions.nonEmpty) {
+              <li>
+                Completed Queries:
+                {listener.getCompletedExecutions.size}
+              </li>
+            }
+          }
+          {
+            if (listener.getFailedExecutions.nonEmpty) {
+              <li>
+                Failed Queries:
+                {listener.getFailedExecutions.size}
+              </li>
+            }
+          }
+        </ul>
+      </div>
+
+ UIUtils.headerSparkPage("SQL", summary ++ content, parent, Some(5000)) } } From 12e740bba110c6ab017c73c5ef940cce39dd45b7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 27 Sep 2017 23:19:10 +0900 Subject: [PATCH 630/779] [SPARK-22130][CORE] UTF8String.trim() scans " " twice ## What changes were proposed in this pull request? This PR allows us to scan a string including only white space (e.g. `" "`) once while the current implementation scans twice (right to left, and then left to right). ## How was this patch tested? Existing test suites Author: Kazuaki Ishizaki Closes #19355 from kiszk/SPARK-22130. --- .../org/apache/spark/unsafe/types/UTF8String.java | 11 +++++------ .../apache/spark/unsafe/types/UTF8StringSuite.java | 3 +++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index ce4a06bde80c4..b0d0c44823e68 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -498,17 +498,16 @@ private UTF8String copyUTF8String(int start, int end) { public UTF8String trim() { int s = 0; - int e = this.numBytes - 1; // skip all of the space (0x20) in the left side while (s < this.numBytes && getByte(s) == 0x20) s++; - // skip all of the space (0x20) in the right side - while (e >= 0 && getByte(e) == 0x20) e--; - if (s > e) { + if (s == this.numBytes) { // empty string return EMPTY_UTF8; - } else { - return copyUTF8String(s, e); } + // skip all of the space (0x20) in the right side + int e = this.numBytes - 1; + while (e > s && getByte(e) == 0x20) e--; + return copyUTF8String(s, e); } /** diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 7b03d2c650fc9..9b303fa5bc6c5 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -222,10 +222,13 @@ public void substring() { @Test public void trims() { + assertEquals(fromString("1"), fromString("1").trim()); + assertEquals(fromString("hello"), fromString(" hello ").trim()); assertEquals(fromString("hello "), fromString(" hello ").trimLeft()); assertEquals(fromString(" hello"), fromString(" hello ").trimRight()); + assertEquals(EMPTY_UTF8, EMPTY_UTF8.trim()); assertEquals(EMPTY_UTF8, fromString(" ").trim()); assertEquals(EMPTY_UTF8, fromString(" ").trimLeft()); assertEquals(EMPTY_UTF8, fromString(" ").trimRight()); From 09cbf3df20efea09c0941499249b7a3b2bf7e9fd Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 27 Sep 2017 23:21:44 +0900 Subject: [PATCH 631/779] [SPARK-22125][PYSPARK][SQL] Enable Arrow Stream format for vectorized UDF. ## What changes were proposed in this pull request? Currently we use Arrow File format to communicate with Python worker when invoking vectorized UDF but we can use Arrow Stream format. This pr replaces the Arrow File format with the Arrow Stream format. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #19349 from ueshin/issues/SPARK-22125. 
--- .../apache/spark/api/python/PythonRDD.scala | 325 +------------ .../spark/api/python/PythonRunner.scala | 441 ++++++++++++++++++ python/pyspark/serializers.py | 70 +-- python/pyspark/worker.py | 4 +- .../execution/vectorized/ColumnarBatch.java | 5 + .../python/ArrowEvalPythonExec.scala | 54 ++- .../execution/python/ArrowPythonRunner.scala | 181 +++++++ .../python/BatchEvalPythonExec.scala | 4 +- .../execution/python/PythonUDFRunner.scala | 113 +++++ 9 files changed, 825 insertions(+), 372 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 86d0405c678a7..f6293c0dc5091 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -48,7 +48,7 @@ private[spark] class PythonRDD( extends RDD[Array[Byte]](parent) { val bufferSize = conf.getInt("spark.buffer.size", 65536) - val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true) + val reuseWorker = conf.getBoolean("spark.python.worker.reuse", true) override def getPartitions: Array[Partition] = firstParent.partitions @@ -59,7 +59,7 @@ private[spark] class PythonRDD( val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { - val runner = PythonRunner(func, bufferSize, reuse_worker) + val runner = PythonRunner(func, bufferSize, reuseWorker) runner.compute(firstParent.iterator(split, context), split.index, context) } } @@ -83,318 +83,9 @@ private[spark] case class PythonFunction( */ private[spark] case class ChainedPythonFunctions(funcs: Seq[PythonFunction]) -/** - * Enumerate the type of command that will be sent to the Python worker - */ -private[spark] object PythonEvalType { - val NON_UDF = 0 - val SQL_BATCHED_UDF = 1 - val SQL_PANDAS_UDF = 2 -} - -private[spark] object PythonRunner { - def apply(func: PythonFunction, bufferSize: Int, reuse_worker: Boolean): PythonRunner = { - new PythonRunner( - Seq(ChainedPythonFunctions(Seq(func))), - bufferSize, - reuse_worker, - PythonEvalType.NON_UDF, - Array(Array(0))) - } -} - -/** - * A helper class to run Python mapPartition/UDFs in Spark. - * - * funcs is a list of independent Python functions, each one of them is a list of chained Python - * functions (from bottom to top). - */ -private[spark] class PythonRunner( - funcs: Seq[ChainedPythonFunctions], - bufferSize: Int, - reuse_worker: Boolean, - evalType: Int, - argOffsets: Array[Array[Int]]) - extends Logging { - - require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") - - // All the Python functions should have the same exec, version and envvars. 
- private val envVars = funcs.head.funcs.head.envVars - private val pythonExec = funcs.head.funcs.head.pythonExec - private val pythonVer = funcs.head.funcs.head.pythonVer - - // TODO: support accumulator in multiple UDF - private val accumulator = funcs.head.funcs.head.accumulator - - def compute( - inputIterator: Iterator[_], - partitionIndex: Int, - context: TaskContext): Iterator[Array[Byte]] = { - val startTime = System.currentTimeMillis - val env = SparkEnv.get - val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") - envVars.put("SPARK_LOCAL_DIRS", localdir) // it's also used in monitor thread - if (reuse_worker) { - envVars.put("SPARK_REUSE_WORKER", "1") - } - val worker: Socket = env.createPythonWorker(pythonExec, envVars.asScala.toMap) - // Whether is the worker released into idle pool - @volatile var released = false - - // Start a thread to feed the process input from our parent's iterator - val writerThread = new WriterThread(env, worker, inputIterator, partitionIndex, context) - - context.addTaskCompletionListener { context => - writerThread.shutdownOnTaskCompletion() - if (!reuse_worker || !released) { - try { - worker.close() - } catch { - case e: Exception => - logWarning("Failed to close worker socket", e) - } - } - } - - writerThread.start() - new MonitorThread(env, worker, context).start() - - // Return an iterator that read lines from the process's stdout - val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) - val stdoutIterator = new Iterator[Array[Byte]] { - override def next(): Array[Byte] = { - val obj = _nextObj - if (hasNext) { - _nextObj = read() - } - obj - } - - private def read(): Array[Byte] = { - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } - try { - stream.readInt() match { - case length if length > 0 => - val obj = new Array[Byte](length) - stream.readFully(obj) - obj - case 0 => Array.empty[Byte] - case SpecialLengths.TIMING_DATA => - // Timing data from worker - val bootTime = stream.readLong() - val initTime = stream.readLong() - val finishTime = stream.readLong() - val boot = bootTime - startTime - val init = initTime - bootTime - val finish = finishTime - initTime - val total = finishTime - startTime - logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, - init, finish)) - val memoryBytesSpilled = stream.readLong() - val diskBytesSpilled = stream.readLong() - context.taskMetrics.incMemoryBytesSpilled(memoryBytesSpilled) - context.taskMetrics.incDiskBytesSpilled(diskBytesSpilled) - read() - case SpecialLengths.PYTHON_EXCEPTION_THROWN => - // Signals that an exception has been thrown in python - val exLength = stream.readInt() - val obj = new Array[Byte](exLength) - stream.readFully(obj) - throw new PythonException(new String(obj, StandardCharsets.UTF_8), - writerThread.exception.getOrElse(null)) - case SpecialLengths.END_OF_DATA_SECTION => - // We've finished the data section of the output, but we can still - // read some accumulator updates: - val numAccumulatorUpdates = stream.readInt() - (1 to numAccumulatorUpdates).foreach { _ => - val updateLen = stream.readInt() - val update = new Array[Byte](updateLen) - stream.readFully(update) - accumulator.add(update) - } - // Check whether the worker is ready to be re-used. 
- if (stream.readInt() == SpecialLengths.END_OF_STREAM) { - if (reuse_worker) { - env.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) - released = true - } - } - null - } - } catch { - - case e: Exception if context.isInterrupted => - logDebug("Exception thrown after task interruption", e) - throw new TaskKilledException(context.getKillReason().getOrElse("unknown reason")) - - case e: Exception if env.isStopped => - logDebug("Exception thrown after context is stopped", e) - null // exit silently - - case e: Exception if writerThread.exception.isDefined => - logError("Python worker exited unexpectedly (crashed)", e) - logError("This may have been caused by a prior exception:", writerThread.exception.get) - throw writerThread.exception.get - - case eof: EOFException => - throw new SparkException("Python worker exited unexpectedly (crashed)", eof) - } - } - - var _nextObj = read() - - override def hasNext: Boolean = _nextObj != null - } - new InterruptibleIterator(context, stdoutIterator) - } - - /** - * The thread responsible for writing the data from the PythonRDD's parent iterator to the - * Python process. - */ - class WriterThread( - env: SparkEnv, - worker: Socket, - inputIterator: Iterator[_], - partitionIndex: Int, - context: TaskContext) - extends Thread(s"stdout writer for $pythonExec") { - - @volatile private var _exception: Exception = null - - private val pythonIncludes = funcs.flatMap(_.funcs.flatMap(_.pythonIncludes.asScala)).toSet - private val broadcastVars = funcs.flatMap(_.funcs.flatMap(_.broadcastVars.asScala)) - - setDaemon(true) - - /** Contains the exception thrown while writing the parent iterator to the Python process. */ - def exception: Option[Exception] = Option(_exception) - - /** Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. 
*/ - def shutdownOnTaskCompletion() { - assert(context.isCompleted) - this.interrupt() - } - - override def run(): Unit = Utils.logUncaughtExceptions { - try { - TaskContext.setTaskContext(context) - val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) - val dataOut = new DataOutputStream(stream) - // Partition index - dataOut.writeInt(partitionIndex) - // Python version of driver - PythonRDD.writeUTF(pythonVer, dataOut) - // Write out the TaskContextInfo - dataOut.writeInt(context.stageId()) - dataOut.writeInt(context.partitionId()) - dataOut.writeInt(context.attemptNumber()) - dataOut.writeLong(context.taskAttemptId()) - // sparkFilesDir - PythonRDD.writeUTF(SparkFiles.getRootDirectory(), dataOut) - // Python includes (*.zip and *.egg files) - dataOut.writeInt(pythonIncludes.size) - for (include <- pythonIncludes) { - PythonRDD.writeUTF(include, dataOut) - } - // Broadcast variables - val oldBids = PythonRDD.getWorkerBroadcasts(worker) - val newBids = broadcastVars.map(_.id).toSet - // number of different broadcasts - val toRemove = oldBids.diff(newBids) - val cnt = toRemove.size + newBids.diff(oldBids).size - dataOut.writeInt(cnt) - for (bid <- toRemove) { - // remove the broadcast from worker - dataOut.writeLong(- bid - 1) // bid >= 0 - oldBids.remove(bid) - } - for (broadcast <- broadcastVars) { - if (!oldBids.contains(broadcast.id)) { - // send new broadcast - dataOut.writeLong(broadcast.id) - PythonRDD.writeUTF(broadcast.value.path, dataOut) - oldBids.add(broadcast.id) - } - } - dataOut.flush() - // Serialized command: - dataOut.writeInt(evalType) - if (evalType != PythonEvalType.NON_UDF) { - dataOut.writeInt(funcs.length) - funcs.zip(argOffsets).foreach { case (chained, offsets) => - dataOut.writeInt(offsets.length) - offsets.foreach { offset => - dataOut.writeInt(offset) - } - dataOut.writeInt(chained.funcs.length) - chained.funcs.foreach { f => - dataOut.writeInt(f.command.length) - dataOut.write(f.command) - } - } - } else { - val command = funcs.head.funcs.head.command - dataOut.writeInt(command.length) - dataOut.write(command) - } - // Data values - PythonRDD.writeIteratorToStream(inputIterator, dataOut) - dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) - dataOut.writeInt(SpecialLengths.END_OF_STREAM) - dataOut.flush() - } catch { - case e: Exception if context.isCompleted || context.isInterrupted => - logDebug("Exception thrown after task completion (likely due to cleanup)", e) - if (!worker.isClosed) { - Utils.tryLog(worker.shutdownOutput()) - } - - case e: Exception => - // We must avoid throwing exceptions here, because the thread uncaught exception handler - // will kill the whole executor (see org.apache.spark.executor.Executor). - _exception = e - if (!worker.isClosed) { - Utils.tryLog(worker.shutdownOutput()) - } - } - } - } - - /** - * It is necessary to have a monitor thread for python workers if the user cancels with - * interrupts disabled. In that case we will need to explicitly kill the worker, otherwise the - * threads can block indefinitely. - */ - class MonitorThread(env: SparkEnv, worker: Socket, context: TaskContext) - extends Thread(s"Worker Monitor for $pythonExec") { - - setDaemon(true) - - override def run() { - // Kill the worker if it is interrupted, checking until task completion. - // TODO: This has a race condition if interruption occurs, as completed may still become true. 
- while (!context.isInterrupted && !context.isCompleted) { - Thread.sleep(2000) - } - if (!context.isCompleted) { - try { - logWarning("Incomplete task interrupted: Attempting to kill Python Worker") - env.destroyPythonWorker(pythonExec, envVars.asScala.toMap, worker) - } catch { - case e: Exception => - logError("Exception when trying to kill worker", e) - } - } - } - } -} - /** Thrown for exceptions in user Python code. */ -private class PythonException(msg: String, cause: Exception) extends RuntimeException(msg, cause) +private[spark] class PythonException(msg: String, cause: Exception) + extends RuntimeException(msg, cause) /** * Form an RDD[(Array[Byte], Array[Byte])] from key-value pairs returned from Python. @@ -411,14 +102,6 @@ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte] val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this) } -private object SpecialLengths { - val END_OF_DATA_SECTION = -1 - val PYTHON_EXCEPTION_THROWN = -2 - val TIMING_DATA = -3 - val END_OF_STREAM = -4 - val NULL = -5 -} - private[spark] object PythonRDD extends Logging { // remember the broadcasts sent to each worker diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala new file mode 100644 index 0000000000000..3688a149443c1 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.python + +import java.io._ +import java.net._ +import java.nio.charset.StandardCharsets +import java.util.concurrent.atomic.AtomicBoolean + +import scala.collection.JavaConverters._ + +import org.apache.spark._ +import org.apache.spark.internal.Logging +import org.apache.spark.util._ + + +/** + * Enumerate the type of command that will be sent to the Python worker + */ +private[spark] object PythonEvalType { + val NON_UDF = 0 + val SQL_BATCHED_UDF = 1 + val SQL_PANDAS_UDF = 2 +} + +/** + * A helper class to run Python mapPartition/UDFs in Spark. + * + * funcs is a list of independent Python functions, each one of them is a list of chained Python + * functions (from bottom to top). + */ +private[spark] abstract class BasePythonRunner[IN, OUT]( + funcs: Seq[ChainedPythonFunctions], + bufferSize: Int, + reuseWorker: Boolean, + evalType: Int, + argOffsets: Array[Array[Int]]) + extends Logging { + + require(funcs.length == argOffsets.length, "argOffsets should have the same length as funcs") + + // All the Python functions should have the same exec, version and envvars. 
+ protected val envVars = funcs.head.funcs.head.envVars + protected val pythonExec = funcs.head.funcs.head.pythonExec + protected val pythonVer = funcs.head.funcs.head.pythonVer + + // TODO: support accumulator in multiple UDF + protected val accumulator = funcs.head.funcs.head.accumulator + + def compute( + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): Iterator[OUT] = { + val startTime = System.currentTimeMillis + val env = SparkEnv.get + val localdir = env.blockManager.diskBlockManager.localDirs.map(f => f.getPath()).mkString(",") + envVars.put("SPARK_LOCAL_DIRS", localdir) // it's also used in monitor thread + if (reuseWorker) { + envVars.put("SPARK_REUSE_WORKER", "1") + } + val worker: Socket = env.createPythonWorker(pythonExec, envVars.asScala.toMap) + // Whether is the worker released into idle pool + val released = new AtomicBoolean(false) + + // Start a thread to feed the process input from our parent's iterator + val writerThread = newWriterThread(env, worker, inputIterator, partitionIndex, context) + + context.addTaskCompletionListener { _ => + writerThread.shutdownOnTaskCompletion() + if (!reuseWorker || !released.get) { + try { + worker.close() + } catch { + case e: Exception => + logWarning("Failed to close worker socket", e) + } + } + } + + writerThread.start() + new MonitorThread(env, worker, context).start() + + // Return an iterator that read lines from the process's stdout + val stream = new DataInputStream(new BufferedInputStream(worker.getInputStream, bufferSize)) + + val stdoutIterator = newReaderIterator( + stream, writerThread, startTime, env, worker, released, context) + new InterruptibleIterator(context, stdoutIterator) + } + + protected def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext): WriterThread + + protected def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + released: AtomicBoolean, + context: TaskContext): Iterator[OUT] + + /** + * The thread responsible for writing the data from the PythonRDD's parent iterator to the + * Python process. + */ + abstract class WriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[IN], + partitionIndex: Int, + context: TaskContext) + extends Thread(s"stdout writer for $pythonExec") { + + @volatile private var _exception: Exception = null + + private val pythonIncludes = funcs.flatMap(_.funcs.flatMap(_.pythonIncludes.asScala)).toSet + private val broadcastVars = funcs.flatMap(_.funcs.flatMap(_.broadcastVars.asScala)) + + setDaemon(true) + + /** Contains the exception thrown while writing the parent iterator to the Python process. */ + def exception: Option[Exception] = Option(_exception) + + /** Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. */ + def shutdownOnTaskCompletion() { + assert(context.isCompleted) + this.interrupt() + } + + /** + * Writes a command section to the stream connected to the Python worker. + */ + protected def writeCommand(dataOut: DataOutputStream): Unit + + /** + * Writes input data to the stream connected to the Python worker. 
+ */ + protected def writeIteratorToStream(dataOut: DataOutputStream): Unit + + override def run(): Unit = Utils.logUncaughtExceptions { + try { + TaskContext.setTaskContext(context) + val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize) + val dataOut = new DataOutputStream(stream) + // Partition index + dataOut.writeInt(partitionIndex) + // Python version of driver + PythonRDD.writeUTF(pythonVer, dataOut) + // Write out the TaskContextInfo + dataOut.writeInt(context.stageId()) + dataOut.writeInt(context.partitionId()) + dataOut.writeInt(context.attemptNumber()) + dataOut.writeLong(context.taskAttemptId()) + // sparkFilesDir + PythonRDD.writeUTF(SparkFiles.getRootDirectory(), dataOut) + // Python includes (*.zip and *.egg files) + dataOut.writeInt(pythonIncludes.size) + for (include <- pythonIncludes) { + PythonRDD.writeUTF(include, dataOut) + } + // Broadcast variables + val oldBids = PythonRDD.getWorkerBroadcasts(worker) + val newBids = broadcastVars.map(_.id).toSet + // number of different broadcasts + val toRemove = oldBids.diff(newBids) + val cnt = toRemove.size + newBids.diff(oldBids).size + dataOut.writeInt(cnt) + for (bid <- toRemove) { + // remove the broadcast from worker + dataOut.writeLong(- bid - 1) // bid >= 0 + oldBids.remove(bid) + } + for (broadcast <- broadcastVars) { + if (!oldBids.contains(broadcast.id)) { + // send new broadcast + dataOut.writeLong(broadcast.id) + PythonRDD.writeUTF(broadcast.value.path, dataOut) + oldBids.add(broadcast.id) + } + } + dataOut.flush() + + dataOut.writeInt(evalType) + writeCommand(dataOut) + writeIteratorToStream(dataOut) + + dataOut.writeInt(SpecialLengths.END_OF_STREAM) + dataOut.flush() + } catch { + case e: Exception if context.isCompleted || context.isInterrupted => + logDebug("Exception thrown after task completion (likely due to cleanup)", e) + if (!worker.isClosed) { + Utils.tryLog(worker.shutdownOutput()) + } + + case e: Exception => + // We must avoid throwing exceptions here, because the thread uncaught exception handler + // will kill the whole executor (see org.apache.spark.executor.Executor). + _exception = e + if (!worker.isClosed) { + Utils.tryLog(worker.shutdownOutput()) + } + } + } + } + + abstract class ReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + released: AtomicBoolean, + context: TaskContext) + extends Iterator[OUT] { + + private var nextObj: OUT = _ + private var eos = false + + override def hasNext: Boolean = nextObj != null || { + if (!eos) { + nextObj = read() + hasNext + } else { + false + } + } + + override def next(): OUT = { + if (hasNext) { + val obj = nextObj + nextObj = null.asInstanceOf[OUT] + obj + } else { + Iterator.empty.next() + } + } + + /** + * Reads next object from the stream. + * When the stream reaches end of data, needs to process the following sections, + * and then returns null. 
+ */ + protected def read(): OUT + + protected def handleTimingData(): Unit = { + // Timing data from worker + val bootTime = stream.readLong() + val initTime = stream.readLong() + val finishTime = stream.readLong() + val boot = bootTime - startTime + val init = initTime - bootTime + val finish = finishTime - initTime + val total = finishTime - startTime + logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, + init, finish)) + val memoryBytesSpilled = stream.readLong() + val diskBytesSpilled = stream.readLong() + context.taskMetrics.incMemoryBytesSpilled(memoryBytesSpilled) + context.taskMetrics.incDiskBytesSpilled(diskBytesSpilled) + } + + protected def handlePythonException(): PythonException = { + // Signals that an exception has been thrown in python + val exLength = stream.readInt() + val obj = new Array[Byte](exLength) + stream.readFully(obj) + new PythonException(new String(obj, StandardCharsets.UTF_8), + writerThread.exception.getOrElse(null)) + } + + protected def handleEndOfDataSection(): Unit = { + // We've finished the data section of the output, but we can still + // read some accumulator updates: + val numAccumulatorUpdates = stream.readInt() + (1 to numAccumulatorUpdates).foreach { _ => + val updateLen = stream.readInt() + val update = new Array[Byte](updateLen) + stream.readFully(update) + accumulator.add(update) + } + // Check whether the worker is ready to be re-used. + if (stream.readInt() == SpecialLengths.END_OF_STREAM) { + if (reuseWorker) { + env.releasePythonWorker(pythonExec, envVars.asScala.toMap, worker) + released.set(true) + } + } + eos = true + } + + protected val handleException: PartialFunction[Throwable, OUT] = { + case e: Exception if context.isInterrupted => + logDebug("Exception thrown after task interruption", e) + throw new TaskKilledException(context.getKillReason().getOrElse("unknown reason")) + + case e: Exception if env.isStopped => + logDebug("Exception thrown after context is stopped", e) + null.asInstanceOf[OUT] // exit silently + + case e: Exception if writerThread.exception.isDefined => + logError("Python worker exited unexpectedly (crashed)", e) + logError("This may have been caused by a prior exception:", writerThread.exception.get) + throw writerThread.exception.get + + case eof: EOFException => + throw new SparkException("Python worker exited unexpectedly (crashed)", eof) + } + } + + /** + * It is necessary to have a monitor thread for python workers if the user cancels with + * interrupts disabled. In that case we will need to explicitly kill the worker, otherwise the + * threads can block indefinitely. + */ + class MonitorThread(env: SparkEnv, worker: Socket, context: TaskContext) + extends Thread(s"Worker Monitor for $pythonExec") { + + setDaemon(true) + + override def run() { + // Kill the worker if it is interrupted, checking until task completion. + // TODO: This has a race condition if interruption occurs, as completed may still become true. 
+ while (!context.isInterrupted && !context.isCompleted) { + Thread.sleep(2000) + } + if (!context.isCompleted) { + try { + logWarning("Incomplete task interrupted: Attempting to kill Python Worker") + env.destroyPythonWorker(pythonExec, envVars.asScala.toMap, worker) + } catch { + case e: Exception => + logError("Exception when trying to kill worker", e) + } + } + } + } +} + +private[spark] object PythonRunner { + + def apply(func: PythonFunction, bufferSize: Int, reuseWorker: Boolean): PythonRunner = { + new PythonRunner(Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuseWorker) + } +} + +/** + * A helper class to run Python mapPartition in Spark. + */ +private[spark] class PythonRunner( + funcs: Seq[ChainedPythonFunctions], + bufferSize: Int, + reuseWorker: Boolean) + extends BasePythonRunner[Array[Byte], Array[Byte]]( + funcs, bufferSize, reuseWorker, PythonEvalType.NON_UDF, Array(Array(0))) { + + protected override def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[Array[Byte]], + partitionIndex: Int, + context: TaskContext): WriterThread = { + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + val command = funcs.head.funcs.head.command + dataOut.writeInt(command.length) + dataOut.write(command) + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + PythonRDD.writeIteratorToStream(inputIterator, dataOut) + dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) + } + } + } + + protected override def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + released: AtomicBoolean, + context: TaskContext): Iterator[Array[Byte]] = { + new ReaderIterator(stream, writerThread, startTime, env, worker, released, context) { + + protected override def read(): Array[Byte] = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + stream.readInt() match { + case length if length > 0 => + val obj = new Array[Byte](length) + stream.readFully(obj) + obj + case 0 => Array.empty[Byte] + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + handleEndOfDataSection() + null + } + } catch handleException + } + } + } +} + +private[spark] object SpecialLengths { + val END_OF_DATA_SECTION = -1 + val PYTHON_EXCEPTION_THROWN = -2 + val TIMING_DATA = -3 + val END_OF_STREAM = -4 + val NULL = -5 + val START_ARROW_STREAM = -6 +} diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 7c1fbadcb82be..db77b7e150b24 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -79,6 +79,7 @@ class SpecialLengths(object): TIMING_DATA = -3 END_OF_STREAM = -4 NULL = -5 + START_ARROW_STREAM = -6 class PythonEvalType(object): @@ -211,44 +212,61 @@ def __repr__(self): return "ArrowSerializer" -class ArrowPandasSerializer(ArrowSerializer): +def _create_batch(series): + import pyarrow as pa + # Make input conform to [(series1, type1), (series2, type2), ...] 
+ if not isinstance(series, (list, tuple)) or \ + (len(series) == 2 and isinstance(series[1], pa.DataType)): + series = [series] + series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series) + + # If a nullable integer series has been promoted to floating point with NaNs, need to cast + # NOTE: this is not necessary with Arrow >= 0.7 + def cast_series(s, t): + if t is None or s.dtype == t.to_pandas_dtype(): + return s + else: + return s.fillna(0).astype(t.to_pandas_dtype(), copy=False) + + arrs = [pa.Array.from_pandas(cast_series(s, t), mask=s.isnull(), type=t) for s, t in series] + return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) + + +class ArrowStreamPandasSerializer(Serializer): """ - Serializes Pandas.Series as Arrow data. + Serializes Pandas.Series as Arrow data with Arrow streaming format. """ - def dumps(self, series): + def dump_stream(self, iterator, stream): """ - Make an ArrowRecordBatch from a Pandas Series and serialize. Input is a single series or + Make ArrowRecordBatches from Pandas Serieses and serialize. Input is a single series or a list of series accompanied by an optional pyarrow type to coerce the data to. """ import pyarrow as pa - # Make input conform to [(series1, type1), (series2, type2), ...] - if not isinstance(series, (list, tuple)) or \ - (len(series) == 2 and isinstance(series[1], pa.DataType)): - series = [series] - series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series) - - # If a nullable integer series has been promoted to floating point with NaNs, need to cast - # NOTE: this is not necessary with Arrow >= 0.7 - def cast_series(s, t): - if t is None or s.dtype == t.to_pandas_dtype(): - return s - else: - return s.fillna(0).astype(t.to_pandas_dtype(), copy=False) - - arrs = [pa.Array.from_pandas(cast_series(s, t), mask=s.isnull(), type=t) for s, t in series] - batch = pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) - return super(ArrowPandasSerializer, self).dumps(batch) + writer = None + try: + for series in iterator: + batch = _create_batch(series) + if writer is None: + write_int(SpecialLengths.START_ARROW_STREAM, stream) + writer = pa.RecordBatchStreamWriter(stream, batch.schema) + writer.write_batch(batch) + finally: + if writer is not None: + writer.close() - def loads(self, obj): + def load_stream(self, stream): """ - Deserialize an ArrowRecordBatch to an Arrow table and return as a list of pandas.Series. + Deserialize ArrowRecordBatchs to an Arrow table and return as a list of pandas.Series. 
""" - table = super(ArrowPandasSerializer, self).loads(obj) - return [c.to_pandas() for c in table.itercolumns()] + import pyarrow as pa + reader = pa.open_stream(stream) + for batch in reader: + table = pa.Table.from_batches([batch]) + yield [c.to_pandas() for c in table.itercolumns()] def __repr__(self): - return "ArrowPandasSerializer" + return "ArrowStreamPandasSerializer" class BatchedSerializer(Serializer): diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index fd917c400c872..4e24789cf010d 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -31,7 +31,7 @@ from pyspark.files import SparkFiles from pyspark.serializers import write_with_length, write_int, read_long, \ write_long, read_int, SpecialLengths, PythonEvalType, UTF8Deserializer, PickleSerializer, \ - BatchedSerializer, ArrowPandasSerializer + BatchedSerializer, ArrowStreamPandasSerializer from pyspark.sql.types import toArrowType from pyspark import shuffle @@ -123,7 +123,7 @@ def read_udfs(pickleSer, infile, eval_type): func = lambda _, it: map(mapper, it) if eval_type == PythonEvalType.SQL_PANDAS_UDF: - ser = ArrowPandasSerializer() + ser = ArrowStreamPandasSerializer() else: ser = BatchedSerializer(PickleSerializer(), 100) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java index e782756a3e781..bc546c7c425b1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java @@ -462,6 +462,11 @@ public int numValidRows() { return numRows - numRowsFiltered; } + /** + * Returns the schema that makes up this batch. + */ + public StructType schema() { return schema; } + /** * Returns the max capacity (in number of rows) for this batch. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index 5e72cd255873a..f7e8cbe416121 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -17,12 +17,13 @@ package org.apache.spark.sql.execution.python +import scala.collection.JavaConverters._ + import org.apache.spark.TaskContext -import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType, PythonRunner} +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.arrow.{ArrowConverters, ArrowPayload} import org.apache.spark.sql.types.StructType /** @@ -39,25 +40,36 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { - val inputIterator = ArrowConverters.toPayloadIterator( - iter, schema, conf.arrowMaxRecordsPerBatch, context).map(_.asPythonSerializable) - - // Output iterator for results from Python. 
- val outputIterator = new PythonRunner( - funcs, bufferSize, reuseWorker, PythonEvalType.SQL_PANDAS_UDF, argOffsets) - .compute(inputIterator, context.partitionId(), context) - - val outputRowIterator = ArrowConverters.fromPayloadIterator( - outputIterator.map(new ArrowPayload(_)), context) - - // Verify that the output schema is correct - if (outputRowIterator.hasNext) { - val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex - .map { case (attr, i) => attr.withName(s"_$i") }) - assert(schemaOut.equals(outputRowIterator.schema), - s"Invalid schema from pandas_udf: expected $schemaOut, got ${outputRowIterator.schema}") - } - outputRowIterator + val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex + .map { case (attr, i) => attr.withName(s"_$i") }) + + val columnarBatchIter = new ArrowPythonRunner( + funcs, conf.arrowMaxRecordsPerBatch, bufferSize, reuseWorker, + PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema) + .compute(iter, context.partitionId(), context) + + new Iterator[InternalRow] { + + var currentIter = if (columnarBatchIter.hasNext) { + val batch = columnarBatchIter.next() + assert(schemaOut.equals(batch.schema), + s"Invalid schema from pandas_udf: expected $schemaOut, got ${batch.schema}") + batch.rowIterator.asScala + } else { + Iterator.empty + } + + override def hasNext: Boolean = currentIter.hasNext || { + if (columnarBatchIter.hasNext) { + currentIter = columnarBatchIter.next().rowIterator.asScala + hasNext + } else { + false + } + } + + override def next(): InternalRow = currentIter.next() + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala new file mode 100644 index 0000000000000..bbad9d6b631fd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.python + +import java.io._ +import java.net._ +import java.util.concurrent.atomic.AtomicBoolean + +import scala.collection.JavaConverters._ + +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.stream.{ArrowStreamReader, ArrowStreamWriter} + +import org.apache.spark._ +import org.apache.spark.api.python._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.arrow.{ArrowUtils, ArrowWriter} +import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +/** + * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream. + */ +class ArrowPythonRunner( + funcs: Seq[ChainedPythonFunctions], + batchSize: Int, + bufferSize: Int, + reuseWorker: Boolean, + evalType: Int, + argOffsets: Array[Array[Int]], + schema: StructType) + extends BasePythonRunner[InternalRow, ColumnarBatch]( + funcs, bufferSize, reuseWorker, evalType, argOffsets) { + + protected override def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[InternalRow], + partitionIndex: Int, + context: TaskContext): WriterThread = { + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + val arrowSchema = ArrowUtils.toArrowSchema(schema) + val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdout writer for $pythonExec", 0, Long.MaxValue) + + val root = VectorSchemaRoot.create(arrowSchema, allocator) + val arrowWriter = ArrowWriter.create(root) + + var closed = false + + context.addTaskCompletionListener { _ => + if (!closed) { + root.close() + allocator.close() + } + } + + val writer = new ArrowStreamWriter(root, null, dataOut) + writer.start() + + Utils.tryWithSafeFinally { + while (inputIterator.hasNext) { + var rowCount = 0 + while (inputIterator.hasNext && (batchSize <= 0 || rowCount < batchSize)) { + val row = inputIterator.next() + arrowWriter.write(row) + rowCount += 1 + } + arrowWriter.finish() + writer.writeBatch() + arrowWriter.reset() + } + } { + writer.end() + root.close() + allocator.close() + closed = true + } + } + } + } + + protected override def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + released: AtomicBoolean, + context: TaskContext): Iterator[ColumnarBatch] = { + new ReaderIterator(stream, writerThread, startTime, env, worker, released, context) { + + private val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdin reader for $pythonExec", 0, Long.MaxValue) + + private var reader: ArrowStreamReader = _ + private var root: VectorSchemaRoot = _ + private var schema: StructType = _ + private var vectors: Array[ColumnVector] = _ + + private var closed = false + + context.addTaskCompletionListener { _ => + // todo: we need something like `reader.end()`, which release all the resources, but leave + // the input stream open. `reader.close()` will close the socket and we can't reuse worker. + // So here we simply not close the reader, which is problematic. 
+ if (!closed) { + if (root != null) { + root.close() + } + allocator.close() + } + } + + private var batchLoaded = true + + protected override def read(): ColumnarBatch = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + if (reader != null && batchLoaded) { + batchLoaded = reader.loadNextBatch() + if (batchLoaded) { + val batch = new ColumnarBatch(schema, vectors, root.getRowCount) + batch.setNumRows(root.getRowCount) + batch + } else { + root.close() + allocator.close() + closed = true + // Reach end of stream. Call `read()` again to read control data. + read() + } + } else { + stream.readInt() match { + case SpecialLengths.START_ARROW_STREAM => + reader = new ArrowStreamReader(stream, allocator) + root = reader.getVectorSchemaRoot() + schema = ArrowUtils.fromArrowSchema(root.getSchema()) + vectors = root.getFieldVectors().asScala.map { vector => + new ArrowColumnVector(vector) + }.toArray[ColumnVector] + read() + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + handleEndOfDataSection() + null + } + } + } catch handleException + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala index 2978eac50554d..26ee25f633ea4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala @@ -22,7 +22,7 @@ import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext -import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType, PythonRunner} +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan @@ -68,7 +68,7 @@ case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. - val outputIterator = new PythonRunner( + val outputIterator = new PythonUDFRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala new file mode 100644 index 0000000000000..e28def1c4b423 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.python + +import java.io._ +import java.net._ +import java.util.concurrent.atomic.AtomicBoolean + +import org.apache.spark._ +import org.apache.spark.api.python._ + +/** + * A helper class to run Python UDFs in Spark. + */ +class PythonUDFRunner( + funcs: Seq[ChainedPythonFunctions], + bufferSize: Int, + reuseWorker: Boolean, + evalType: Int, + argOffsets: Array[Array[Int]]) + extends BasePythonRunner[Array[Byte], Array[Byte]]( + funcs, bufferSize, reuseWorker, evalType, argOffsets) { + + protected override def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[Array[Byte]], + partitionIndex: Int, + context: TaskContext): WriterThread = { + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + PythonRDD.writeIteratorToStream(inputIterator, dataOut) + dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION) + } + } + } + + protected override def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + released: AtomicBoolean, + context: TaskContext): Iterator[Array[Byte]] = { + new ReaderIterator(stream, writerThread, startTime, env, worker, released, context) { + + protected override def read(): Array[Byte] = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + stream.readInt() match { + case length if length > 0 => + val obj = new Array[Byte](length) + stream.readFully(obj) + obj + case 0 => Array.empty[Byte] + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + handleEndOfDataSection() + null + } + } catch handleException + } + } + } +} + +object PythonUDFRunner { + + def writeUDFs( + dataOut: DataOutputStream, + funcs: Seq[ChainedPythonFunctions], + argOffsets: Array[Array[Int]]): Unit = { + dataOut.writeInt(funcs.length) + funcs.zip(argOffsets).foreach { case (chained, offsets) => + dataOut.writeInt(offsets.length) + offsets.foreach { offset => + dataOut.writeInt(offset) + } + dataOut.writeInt(chained.funcs.length) + chained.funcs.foreach { f => + dataOut.writeInt(f.command.length) + dataOut.write(f.command) + } + } + } +} From 9b98aef6a39a5a9ea9fc5481b5a0d92620ba6347 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 27 Sep 2017 13:40:21 -0700 Subject: [PATCH 632/779] [HOTFIX][BUILD] Fix finalizer checkstyle error and re-disable checkstyle ## What changes were proposed in this pull request? Fix finalizer checkstyle violation by just turning it off; re-disable checkstyle as it won't be run by SBT PR builder. See https://github.com/apache/spark/pull/18887#issuecomment-332580700 ## How was this patch tested? `./dev/lint-java` runs successfully Author: Sean Owen Closes #19371 from srowen/HotfixFinalizerCheckstlye. 
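For reference, the byte layout emitted by the new `PythonUDFRunner.writeUDFs` introduced in the pandas_udf change above can be read back as sketched below. This is only an illustration: the real consumer of this framing is PySpark's Python worker, and `UDFFramingSketch`, `WireUDF`, and `readUDFs` are hypothetical names used to make the layout explicit, not code from any of these patches.

```scala
import java.io.DataInputStream

object UDFFramingSketch {
  // One chained UDF as it appears on the wire: the argument offsets its inputs
  // are read from, and the serialized command bytes of each function in the chain.
  case class WireUDF(argOffsets: Seq[Int], commands: Seq[Array[Byte]])

  // Mirror of writeUDFs: each readInt below corresponds to one writeInt above.
  def readUDFs(in: DataInputStream): Seq[WireUDF] = {
    val numUdfs = in.readInt()                          // writeInt(funcs.length)
    (0 until numUdfs).map { _ =>
      val numOffsets = in.readInt()                     // writeInt(offsets.length)
      val offsets = Seq.fill(numOffsets)(in.readInt())  // writeInt(offset)
      val numChained = in.readInt()                     // writeInt(chained.funcs.length)
      val commands = Seq.fill(numChained) {
        val len = in.readInt()                          // writeInt(f.command.length)
        val bytes = new Array[Byte](len)
        in.readFully(bytes)                             // write(f.command)
        bytes
      }
      WireUDF(offsets, commands)
    }
  }
}
```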
--- .../java/org/apache/spark/io/NioBufferedFileInputStream.java | 2 -- dev/checkstyle-suppressions.xml | 2 -- dev/checkstyle.xml | 1 - pom.xml | 1 + 4 files changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java index ea5f1a9abf69b..f6d1288cb263d 100644 --- a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -130,10 +130,8 @@ public synchronized void close() throws IOException { StorageUtils.dispose(byteBuffer); } - //checkstyle.off: NoFinalizer @Override protected void finalize() throws IOException { close(); } - //checkstyle.on: NoFinalizer } diff --git a/dev/checkstyle-suppressions.xml b/dev/checkstyle-suppressions.xml index 6e15f6955984e..bbda824dd13b4 100644 --- a/dev/checkstyle-suppressions.xml +++ b/dev/checkstyle-suppressions.xml @@ -40,8 +40,6 @@ files="src/main/java/org/apache/hive/service/*"/> -
- diff --git a/pom.xml b/pom.xml index b0408ecca0f66..83a35006707da 100644 --- a/pom.xml +++ b/pom.xml @@ -2488,6 +2488,7 @@ maven-checkstyle-plugin 2.17 + false true ${basedir}/src/main/java,${basedir}/src/main/scala ${basedir}/src/test/java From 02bb0682e68a2ce81f3b98d33649d368da7f2b3d Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Wed, 27 Sep 2017 23:08:30 +0200 Subject: [PATCH 633/779] [SPARK-22143][SQL] Fix memory leak in OffHeapColumnVector ## What changes were proposed in this pull request? `WriteableColumnVector` does not close its child column vectors. This can create memory leaks for `OffHeapColumnVector` where we do not clean up the memory allocated by a vectors children. This can be especially bad for string columns (which uses a child byte column vector). ## How was this patch tested? I have updated the existing tests to always use both on-heap and off-heap vectors. Testing and diagnoses was done locally. Author: Herman van Hovell Closes #19367 from hvanhovell/SPARK-22143. --- .../vectorized/OffHeapColumnVector.java | 1 + .../vectorized/OnHeapColumnVector.java | 10 + .../vectorized/WritableColumnVector.java | 18 ++ .../vectorized/ColumnVectorSuite.scala | 102 +++++---- .../vectorized/ColumnarBatchSuite.scala | 194 ++++++++---------- 5 files changed, 165 insertions(+), 160 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index e1d36858d4eee..8cbc895506d91 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -85,6 +85,7 @@ public long nullsNativeAddress() { @Override public void close() { + super.close(); Platform.freeMemory(nulls); Platform.freeMemory(data); Platform.freeMemory(lengthData); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 96a452978cb35..2725a29eeabe8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -90,6 +90,16 @@ public long nullsNativeAddress() { @Override public void close() { + super.close(); + nulls = null; + byteData = null; + shortData = null; + intData = null; + longData = null; + floatData = null; + doubleData = null; + arrayLengths = null; + arrayOffsets = null; } // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 0bddc351e1bed..163f2511e5f73 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -59,6 +59,24 @@ public void reset() { } } + @Override + public void close() { + if (childColumns != null) { + for (int i = 0; i < childColumns.length; i++) { + childColumns[i].close(); + childColumns[i] = null; + } + childColumns = null; + } + if (dictionaryIds != null) { + dictionaryIds.close(); + dictionaryIds = null; + } + dictionary = null; + resultStruct = null; + resultArray = null; + } + public void reserve(int requiredCapacity) { if 
(requiredCapacity > capacity) { int newCapacity = (int) Math.min(MAX_CAPACITY, requiredCapacity * 2L); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index f7b06c97f9db6..85da8270d4cba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -25,19 +25,24 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { - - var testVector: WritableColumnVector = _ - - private def allocate(capacity: Int, dt: DataType): WritableColumnVector = { - new OnHeapColumnVector(capacity, dt) + private def withVector( + vector: WritableColumnVector)( + block: WritableColumnVector => Unit): Unit = { + try block(vector) finally vector.close() } - override def afterEach(): Unit = { - testVector.close() + private def testVectors( + name: String, + size: Int, + dt: DataType)( + block: WritableColumnVector => Unit): Unit = { + test(name) { + withVector(new OnHeapColumnVector(size, dt))(block) + withVector(new OffHeapColumnVector(size, dt))(block) + } } - test("boolean") { - testVector = allocate(10, BooleanType) + testVectors("boolean", 10, BooleanType) { testVector => (0 until 10).foreach { i => testVector.appendBoolean(i % 2 == 0) } @@ -49,8 +54,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("byte") { - testVector = allocate(10, ByteType) + testVectors("byte", 10, ByteType) { testVector => (0 until 10).foreach { i => testVector.appendByte(i.toByte) } @@ -58,12 +62,11 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { val array = new ColumnVector.Array(testVector) (0 until 10).foreach { i => - assert(array.get(i, ByteType) === (i.toByte)) + assert(array.get(i, ByteType) === i.toByte) } } - test("short") { - testVector = allocate(10, ShortType) + testVectors("short", 10, ShortType) { testVector => (0 until 10).foreach { i => testVector.appendShort(i.toShort) } @@ -71,12 +74,11 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { val array = new ColumnVector.Array(testVector) (0 until 10).foreach { i => - assert(array.get(i, ShortType) === (i.toShort)) + assert(array.get(i, ShortType) === i.toShort) } } - test("int") { - testVector = allocate(10, IntegerType) + testVectors("int", 10, IntegerType) { testVector => (0 until 10).foreach { i => testVector.appendInt(i) } @@ -88,8 +90,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("long") { - testVector = allocate(10, LongType) + testVectors("long", 10, LongType) { testVector => (0 until 10).foreach { i => testVector.appendLong(i) } @@ -101,8 +102,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("float") { - testVector = allocate(10, FloatType) + testVectors("float", 10, FloatType) { testVector => (0 until 10).foreach { i => testVector.appendFloat(i.toFloat) } @@ -114,8 +114,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("double") { - testVector = allocate(10, DoubleType) + testVectors("double", 10, DoubleType) { testVector => (0 until 10).foreach { i => testVector.appendDouble(i.toDouble) } @@ -127,8 +126,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { 
} } - test("string") { - testVector = allocate(10, StringType) + testVectors("string", 10, StringType) { testVector => (0 until 10).map { i => val utf8 = s"str$i".getBytes("utf8") testVector.appendByteArray(utf8, 0, utf8.length) @@ -141,8 +139,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("binary") { - testVector = allocate(10, BinaryType) + testVectors("binary", 10, BinaryType) { testVector => (0 until 10).map { i => val utf8 = s"str$i".getBytes("utf8") testVector.appendByteArray(utf8, 0, utf8.length) @@ -156,9 +153,8 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } - test("array") { - val arrayType = ArrayType(IntegerType, true) - testVector = allocate(10, arrayType) + val arrayType: ArrayType = ArrayType(IntegerType, containsNull = true) + testVectors("array", 10, arrayType) { testVector => val data = testVector.arrayData() var i = 0 @@ -181,9 +177,8 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { assert(array.get(3, arrayType).asInstanceOf[ArrayData].toIntArray() === Array(3, 4, 5)) } - test("struct") { - val schema = new StructType().add("int", IntegerType).add("double", DoubleType) - testVector = allocate(10, schema) + val structType: StructType = new StructType().add("int", IntegerType).add("double", DoubleType) + testVectors("struct", 10, structType) { testVector => val c1 = testVector.getChildColumn(0) val c2 = testVector.getChildColumn(1) c1.putInt(0, 123) @@ -193,35 +188,34 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { val array = new ColumnVector.Array(testVector) - assert(array.get(0, schema).asInstanceOf[ColumnarBatch.Row].get(0, IntegerType) === 123) - assert(array.get(0, schema).asInstanceOf[ColumnarBatch.Row].get(1, DoubleType) === 3.45) - assert(array.get(1, schema).asInstanceOf[ColumnarBatch.Row].get(0, IntegerType) === 456) - assert(array.get(1, schema).asInstanceOf[ColumnarBatch.Row].get(1, DoubleType) === 5.67) + assert(array.get(0, structType).asInstanceOf[ColumnarBatch.Row].get(0, IntegerType) === 123) + assert(array.get(0, structType).asInstanceOf[ColumnarBatch.Row].get(1, DoubleType) === 3.45) + assert(array.get(1, structType).asInstanceOf[ColumnarBatch.Row].get(0, IntegerType) === 456) + assert(array.get(1, structType).asInstanceOf[ColumnarBatch.Row].get(1, DoubleType) === 5.67) } test("[SPARK-22092] off-heap column vector reallocation corrupts array data") { - val arrayType = ArrayType(IntegerType, true) - testVector = new OffHeapColumnVector(8, arrayType) + withVector(new OffHeapColumnVector(8, arrayType)) { testVector => + val data = testVector.arrayData() + (0 until 8).foreach(i => data.putInt(i, i)) + (0 until 8).foreach(i => testVector.putArray(i, i, 1)) - val data = testVector.arrayData() - (0 until 8).foreach(i => data.putInt(i, i)) - (0 until 8).foreach(i => testVector.putArray(i, i, 1)) + // Increase vector's capacity and reallocate the data to new bigger buffers. + testVector.reserve(16) - // Increase vector's capacity and reallocate the data to new bigger buffers. - testVector.reserve(16) - - // Check that none of the values got lost/overwritten. - val array = new ColumnVector.Array(testVector) - (0 until 8).foreach { i => - assert(array.get(i, arrayType).asInstanceOf[ArrayData].toIntArray() === Array(i)) + // Check that none of the values got lost/overwritten. 
+ val array = new ColumnVector.Array(testVector) + (0 until 8).foreach { i => + assert(array.get(i, arrayType).asInstanceOf[ArrayData].toIntArray() === Array(i)) + } } } test("[SPARK-22092] off-heap column vector reallocation corrupts struct nullability") { - val structType = new StructType().add("int", IntegerType).add("double", DoubleType) - testVector = new OffHeapColumnVector(8, structType) - (0 until 8).foreach(i => if (i % 2 == 0) testVector.putNull(i) else testVector.putNotNull(i)) - testVector.reserve(16) - (0 until 8).foreach(i => assert(testVector.isNullAt(i) == (i % 2 == 0))) + withVector(new OffHeapColumnVector(8, structType)) { testVector => + (0 until 8).foreach(i => if (i % 2 == 0) testVector.putNull(i) else testVector.putNotNull(i)) + testVector.reserve(16) + (0 until 8).foreach(i => assert(testVector.isNullAt(i) == (i % 2 == 0))) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index ebf76613343ba..983eb103682c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -38,7 +38,7 @@ import org.apache.spark.unsafe.types.CalendarInterval class ColumnarBatchSuite extends SparkFunSuite { - def allocate(capacity: Int, dt: DataType, memMode: MemoryMode): WritableColumnVector = { + private def allocate(capacity: Int, dt: DataType, memMode: MemoryMode): WritableColumnVector = { if (memMode == MemoryMode.OFF_HEAP) { new OffHeapColumnVector(capacity, dt) } else { @@ -46,23 +46,36 @@ class ColumnarBatchSuite extends SparkFunSuite { } } - test("Null Apis") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { - val reference = mutable.ArrayBuffer.empty[Boolean] + private def testVector( + name: String, + size: Int, + dt: DataType)( + block: (WritableColumnVector, MemoryMode) => Unit): Unit = { + test(name) { + Seq(MemoryMode.ON_HEAP, MemoryMode.OFF_HEAP).foreach { mode => + val vector = allocate(size, dt, mode) + try block(vector, mode) finally { + vector.close() + } + } + } + } - val column = allocate(1024, IntegerType, memMode) + testVector("Null APIs", 1024, IntegerType) { + (column, memMode) => + val reference = mutable.ArrayBuffer.empty[Boolean] var idx = 0 - assert(column.anyNullsSet() == false) + assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNotNull() reference += false - assert(column.anyNullsSet() == false) + assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNotNulls(3) (1 to 3).foreach(_ => reference += false) - assert(column.anyNullsSet() == false) + assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNull() @@ -113,16 +126,12 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == (Platform.getByte(null, addr + v._2) == 1), "index=" + v._2) } } - column.close - }} } - test("Byte Apis") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Byte APIs", 1024, ByteType) { + (column, memMode) => val reference = mutable.ArrayBuffer.empty[Byte] - val column = allocate(1024, ByteType, memMode) - var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).map(_.toByte).toArray column.appendBytes(2, values, 0) reference += 10.toByte @@ -170,17 +179,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getByte(null, addr + 
v._2)) } } - }} } - test("Short Apis") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Short APIs", 1024, ShortType) { + (column, memMode) => val seed = System.currentTimeMillis() val random = new Random(seed) val reference = mutable.ArrayBuffer.empty[Short] - val column = allocate(1024, ShortType, memMode) - var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).map(_.toShort).toArray column.appendShorts(2, values, 0) reference += 10.toShort @@ -248,19 +254,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getShort(null, addr + 2 * v._2)) } } - - column.close - }} } - test("Int Apis") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Int APIs", 1024, IntegerType) { + (column, memMode) => val seed = System.currentTimeMillis() val random = new Random(seed) val reference = mutable.ArrayBuffer.empty[Int] - val column = allocate(1024, IntegerType, memMode) - var values = (10 :: 20 :: 30 :: 40 :: 50 :: Nil).toArray column.appendInts(2, values, 0) reference += 10 @@ -334,18 +335,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getInt(null, addr + 4 * v._2)) } } - column.close - }} } - test("Long Apis") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Long APIs", 1024, LongType) { + (column, memMode) => val seed = System.currentTimeMillis() val random = new Random(seed) val reference = mutable.ArrayBuffer.empty[Long] - val column = allocate(1024, LongType, memMode) - var values = (10L :: 20L :: 30L :: 40L :: 50L :: Nil).toArray column.appendLongs(2, values, 0) reference += 10L @@ -422,17 +419,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getLong(null, addr + 8 * v._2)) } } - }} } - test("Float APIs") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Float APIs", 1024, FloatType) { + (column, memMode) => val seed = System.currentTimeMillis() val random = new Random(seed) val reference = mutable.ArrayBuffer.empty[Float] - val column = allocate(1024, FloatType, memMode) - var values = (.1f :: .2f :: .3f :: .4f :: .5f :: Nil).toArray column.appendFloats(2, values, 0) reference += .1f @@ -512,18 +506,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getFloat(null, addr + 4 * v._2)) } } - column.close - }} } - test("Double APIs") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("Double APIs", 1024, DoubleType) { + (column, memMode) => val seed = System.currentTimeMillis() val random = new Random(seed) val reference = mutable.ArrayBuffer.empty[Double] - val column = allocate(1024, DoubleType, memMode) - var values = (.1 :: .2 :: .3 :: .4 :: .5 :: Nil).toArray column.appendDoubles(2, values, 0) reference += .1 @@ -603,15 +593,12 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(v._1 == Platform.getDouble(null, addr + 8 * v._2)) } } - column.close - }} } - test("String APIs") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { + testVector("String APIs", 6, StringType) { + (column, memMode) => val reference = mutable.ArrayBuffer.empty[String] - val column = allocate(6, BinaryType, memMode) assert(column.arrayData().elementsAppended == 0) val str = "string" @@ -663,15 +650,13 @@ class ColumnarBatchSuite extends SparkFunSuite { column.reset() assert(column.arrayData().elementsAppended == 0) - }} } - test("Int Array") { - (MemoryMode.ON_HEAP :: 
MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { - val column = allocate(10, new ArrayType(IntegerType, true), memMode) + testVector("Int Array", 10, new ArrayType(IntegerType, true)) { + (column, _) => // Fill the underlying data with all the arrays back to back. - val data = column.arrayData(); + val data = column.arrayData() var i = 0 while (i < 6) { data.putInt(i, i) @@ -709,7 +694,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.getArray(3).getInt(2) == 5) // Add a longer array which requires resizing - column.reset + column.reset() val array = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) assert(data.capacity == 10) data.reserve(array.length) @@ -718,63 +703,67 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putArray(0, 0, array.length) assert(ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(0)).asInstanceOf[Array[Int]] === array) - }} } test("toArray for primitive types") { - // (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { - (MemoryMode.ON_HEAP :: Nil).foreach { memMode => { + (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => val len = 4 val columnBool = allocate(len, new ArrayType(BooleanType, false), memMode) val boolArray = Array(false, true, false, true) - boolArray.zipWithIndex.map { case (v, i) => columnBool.arrayData.putBoolean(i, v) } + boolArray.zipWithIndex.foreach { case (v, i) => columnBool.arrayData.putBoolean(i, v) } columnBool.putArray(0, 0, len) assert(columnBool.getArray(0).toBooleanArray === boolArray) + columnBool.close() val columnByte = allocate(len, new ArrayType(ByteType, false), memMode) val byteArray = Array[Byte](0, 1, 2, 3) - byteArray.zipWithIndex.map { case (v, i) => columnByte.arrayData.putByte(i, v) } + byteArray.zipWithIndex.foreach { case (v, i) => columnByte.arrayData.putByte(i, v) } columnByte.putArray(0, 0, len) assert(columnByte.getArray(0).toByteArray === byteArray) + columnByte.close() val columnShort = allocate(len, new ArrayType(ShortType, false), memMode) val shortArray = Array[Short](0, 1, 2, 3) - shortArray.zipWithIndex.map { case (v, i) => columnShort.arrayData.putShort(i, v) } + shortArray.zipWithIndex.foreach { case (v, i) => columnShort.arrayData.putShort(i, v) } columnShort.putArray(0, 0, len) assert(columnShort.getArray(0).toShortArray === shortArray) + columnShort.close() val columnInt = allocate(len, new ArrayType(IntegerType, false), memMode) val intArray = Array(0, 1, 2, 3) - intArray.zipWithIndex.map { case (v, i) => columnInt.arrayData.putInt(i, v) } + intArray.zipWithIndex.foreach { case (v, i) => columnInt.arrayData.putInt(i, v) } columnInt.putArray(0, 0, len) assert(columnInt.getArray(0).toIntArray === intArray) + columnInt.close() val columnLong = allocate(len, new ArrayType(LongType, false), memMode) val longArray = Array[Long](0, 1, 2, 3) - longArray.zipWithIndex.map { case (v, i) => columnLong.arrayData.putLong(i, v) } + longArray.zipWithIndex.foreach { case (v, i) => columnLong.arrayData.putLong(i, v) } columnLong.putArray(0, 0, len) assert(columnLong.getArray(0).toLongArray === longArray) + columnLong.close() val columnFloat = allocate(len, new ArrayType(FloatType, false), memMode) val floatArray = Array(0.0F, 1.1F, 2.2F, 3.3F) - floatArray.zipWithIndex.map { case (v, i) => columnFloat.arrayData.putFloat(i, v) } + floatArray.zipWithIndex.foreach { case (v, i) => columnFloat.arrayData.putFloat(i, v) } columnFloat.putArray(0, 0, len) assert(columnFloat.getArray(0).toFloatArray === floatArray) + columnFloat.close() val columnDouble = 
allocate(len, new ArrayType(DoubleType, false), memMode) val doubleArray = Array(0.0, 1.1, 2.2, 3.3) - doubleArray.zipWithIndex.map { case (v, i) => columnDouble.arrayData.putDouble(i, v) } + doubleArray.zipWithIndex.foreach { case (v, i) => columnDouble.arrayData.putDouble(i, v) } columnDouble.putArray(0, 0, len) assert(columnDouble.getArray(0).toDoubleArray === doubleArray) - }} + columnDouble.close() + } } - test("Struct Column") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => { - val schema = new StructType().add("int", IntegerType).add("double", DoubleType) - val column = allocate(1024, schema, memMode) - + testVector( + "Struct Column", + 10, + new StructType().add("int", IntegerType).add("double", DoubleType)) { (column, _) => val c1 = column.getChildColumn(0) val c2 = column.getChildColumn(1) assert(c1.dataType() == IntegerType) @@ -797,13 +786,10 @@ class ColumnarBatchSuite extends SparkFunSuite { val s2 = column.getStruct(1) assert(s2.getInt(0) == 456) assert(s2.getDouble(1) == 5.67) - }} } - test("Nest Array in Array.") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => - val column = allocate(10, new ArrayType(new ArrayType(IntegerType, true), true), - memMode) + testVector("Nest Array in Array", 10, new ArrayType(new ArrayType(IntegerType, true), true)) { + (column, _) => val childColumn = column.arrayData() val data = column.arrayData().arrayData() (0 until 6).foreach { @@ -829,13 +815,14 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.getArray(2).getArray(1).getInt(1) === 4) assert(column.getArray(2).getArray(1).getInt(2) === 5) assert(column.isNullAt(3)) - } } - test("Nest Struct in Array.") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => - val schema = new StructType().add("int", IntegerType).add("long", LongType) - val column = allocate(10, new ArrayType(schema, true), memMode) + private val structType: StructType = new StructType().add("i", IntegerType).add("l", LongType) + + testVector( + "Nest Struct in Array", + 10, + new ArrayType(structType, true)) { (column, _) => val data = column.arrayData() val c0 = data.getChildColumn(0) val c1 = data.getChildColumn(1) @@ -850,22 +837,21 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putArray(1, 1, 3) column.putArray(2, 4, 2) - assert(column.getArray(0).getStruct(0, 2).toSeq(schema) === Seq(0, 0)) - assert(column.getArray(0).getStruct(1, 2).toSeq(schema) === Seq(1, 10)) - assert(column.getArray(1).getStruct(0, 2).toSeq(schema) === Seq(1, 10)) - assert(column.getArray(1).getStruct(1, 2).toSeq(schema) === Seq(2, 20)) - assert(column.getArray(1).getStruct(2, 2).toSeq(schema) === Seq(3, 30)) - assert(column.getArray(2).getStruct(0, 2).toSeq(schema) === Seq(4, 40)) - assert(column.getArray(2).getStruct(1, 2).toSeq(schema) === Seq(5, 50)) - } + assert(column.getArray(0).getStruct(0, 2).toSeq(structType) === Seq(0, 0)) + assert(column.getArray(0).getStruct(1, 2).toSeq(structType) === Seq(1, 10)) + assert(column.getArray(1).getStruct(0, 2).toSeq(structType) === Seq(1, 10)) + assert(column.getArray(1).getStruct(1, 2).toSeq(structType) === Seq(2, 20)) + assert(column.getArray(1).getStruct(2, 2).toSeq(structType) === Seq(3, 30)) + assert(column.getArray(2).getStruct(0, 2).toSeq(structType) === Seq(4, 40)) + assert(column.getArray(2).getStruct(1, 2).toSeq(structType) === Seq(5, 50)) } - test("Nest Array in Struct.") { - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => - val schema = new StructType() - 
.add("int", IntegerType) - .add("array", new ArrayType(IntegerType, true)) - val column = allocate(10, schema, memMode) + testVector( + "Nest Array in Struct", + 10, + new StructType() + .add("int", IntegerType) + .add("array", new ArrayType(IntegerType, true))) { (column, _) => val c0 = column.getChildColumn(0) val c1 = column.getChildColumn(1) c0.putInt(0, 0) @@ -886,18 +872,15 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.getStruct(1).getArray(1).toIntArray() === Array(2)) assert(column.getStruct(2).getInt(0) === 2) assert(column.getStruct(2).getArray(1).toIntArray() === Array(3, 4, 5)) - } } - test("Nest Struct in Struct.") { - (MemoryMode.ON_HEAP :: Nil).foreach { memMode => - val subSchema = new StructType() - .add("int", IntegerType) - .add("int", IntegerType) - val schema = new StructType() - .add("int", IntegerType) - .add("struct", subSchema) - val column = allocate(10, schema, memMode) + private val subSchema: StructType = new StructType() + .add("int", IntegerType) + .add("int", IntegerType) + testVector( + "Nest Struct in Struct", + 10, + new StructType().add("int", IntegerType).add("struct", subSchema)) { (column, _) => val c0 = column.getChildColumn(0) val c1 = column.getChildColumn(1) c0.putInt(0, 0) @@ -919,7 +902,6 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.getStruct(1).getStruct(1, 2).toSeq(subSchema) === Seq(8, 80)) assert(column.getStruct(2).getInt(0) === 2) assert(column.getStruct(2).getStruct(1, 2).toSeq(subSchema) === Seq(9, 90)) - } } test("ColumnarBatch basic") { @@ -1040,7 +1022,7 @@ class ColumnarBatchSuite extends SparkFunSuite { val it4 = batch.rowIterator() rowEquals(it4.next(), Row(null, 2.2, 2, "abc")) - batch.close + batch.close() }} } From 9244957b500cb2b458c32db2c63293a1444690d7 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 27 Sep 2017 17:03:42 -0700 Subject: [PATCH 634/779] [SPARK-22140] Add TPCDSQuerySuite ## What changes were proposed in this pull request? Now, we are not running TPC-DS queries as regular test cases. Thus, we need to add a test suite using empty tables for ensuring the new code changes will not break them. For example, optimizer/analyzer batches should not exceed the max iteration. ## How was this patch tested? N/A Author: gatorsmile Closes #19361 from gatorsmile/tpcdsQuerySuite. --- .../apache/spark/sql/TPCDSQuerySuite.scala | 348 ++++++++++++++++++ 1 file changed, 348 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala new file mode 100644 index 0000000000000..c0797fa55f5da --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.catalyst.util.resourceToString +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfterAll { + + /** + * Drop all the tables + */ + protected override def afterAll(): Unit = { + try { + spark.sessionState.catalog.reset() + } finally { + super.afterAll() + } + } + + override def beforeAll() { + super.beforeAll() + sql( + """ + |CREATE TABLE `catalog_page` ( + |`cp_catalog_page_sk` INT, `cp_catalog_page_id` STRING, `cp_start_date_sk` INT, + |`cp_end_date_sk` INT, `cp_department` STRING, `cp_catalog_number` INT, + |`cp_catalog_page_number` INT, `cp_description` STRING, `cp_type` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `catalog_returns` ( + |`cr_returned_date_sk` INT, `cr_returned_time_sk` INT, `cr_item_sk` INT, + |`cr_refunded_customer_sk` INT, `cr_refunded_cdemo_sk` INT, `cr_refunded_hdemo_sk` INT, + |`cr_refunded_addr_sk` INT, `cr_returning_customer_sk` INT, `cr_returning_cdemo_sk` INT, + |`cr_returning_hdemo_sk` INT, `cr_returning_addr_sk` INT, `cr_call_center_sk` INT, + |`cr_catalog_page_sk` INT, `cr_ship_mode_sk` INT, `cr_warehouse_sk` INT, `cr_reason_sk` INT, + |`cr_order_number` INT, `cr_return_quantity` INT, `cr_return_amount` DECIMAL(7,2), + |`cr_return_tax` DECIMAL(7,2), `cr_return_amt_inc_tax` DECIMAL(7,2), `cr_fee` DECIMAL(7,2), + |`cr_return_ship_cost` DECIMAL(7,2), `cr_refunded_cash` DECIMAL(7,2), + |`cr_reversed_charge` DECIMAL(7,2), `cr_store_credit` DECIMAL(7,2), + |`cr_net_loss` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer` ( + |`c_customer_sk` INT, `c_customer_id` STRING, `c_current_cdemo_sk` INT, + |`c_current_hdemo_sk` INT, `c_current_addr_sk` INT, `c_first_shipto_date_sk` INT, + |`c_first_sales_date_sk` INT, `c_salutation` STRING, `c_first_name` STRING, + |`c_last_name` STRING, `c_preferred_cust_flag` STRING, `c_birth_day` INT, + |`c_birth_month` INT, `c_birth_year` INT, `c_birth_country` STRING, `c_login` STRING, + |`c_email_address` STRING, `c_last_review_date` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer_address` ( + |`ca_address_sk` INT, `ca_address_id` STRING, `ca_street_number` STRING, + |`ca_street_name` STRING, `ca_street_type` STRING, `ca_suite_number` STRING, + |`ca_city` STRING, `ca_county` STRING, `ca_state` STRING, `ca_zip` STRING, + |`ca_country` STRING, `ca_gmt_offset` DECIMAL(5,2), `ca_location_type` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer_demographics` ( + |`cd_demo_sk` INT, `cd_gender` STRING, `cd_marital_status` STRING, + |`cd_education_status` STRING, `cd_purchase_estimate` INT, `cd_credit_rating` STRING, + |`cd_dep_count` INT, `cd_dep_employed_count` INT, `cd_dep_college_count` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `date_dim` ( + |`d_date_sk` INT, `d_date_id` STRING, `d_date` STRING, + |`d_month_seq` INT, `d_week_seq` INT, `d_quarter_seq` INT, `d_year` INT, `d_dow` INT, + |`d_moy` INT, `d_dom` INT, `d_qoy` INT, `d_fy_year` INT, `d_fy_quarter_seq` INT, + |`d_fy_week_seq` INT, `d_day_name` STRING, `d_quarter_name` STRING, `d_holiday` STRING, + |`d_weekend` STRING, `d_following_holiday` STRING, `d_first_dom` INT, 
`d_last_dom` INT, + |`d_same_day_ly` INT, `d_same_day_lq` INT, `d_current_day` STRING, `d_current_week` STRING, + |`d_current_month` STRING, `d_current_quarter` STRING, `d_current_year` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `household_demographics` ( + |`hd_demo_sk` INT, `hd_income_band_sk` INT, `hd_buy_potential` STRING, `hd_dep_count` INT, + |`hd_vehicle_count` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `inventory` (`inv_date_sk` INT, `inv_item_sk` INT, `inv_warehouse_sk` INT, + |`inv_quantity_on_hand` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `item` (`i_item_sk` INT, `i_item_id` STRING, `i_rec_start_date` STRING, + |`i_rec_end_date` STRING, `i_item_desc` STRING, `i_current_price` DECIMAL(7,2), + |`i_wholesale_cost` DECIMAL(7,2), `i_brand_id` INT, `i_brand` STRING, `i_class_id` INT, + |`i_class` STRING, `i_category_id` INT, `i_category` STRING, `i_manufact_id` INT, + |`i_manufact` STRING, `i_size` STRING, `i_formulation` STRING, `i_color` STRING, + |`i_units` STRING, `i_container` STRING, `i_manager_id` INT, `i_product_name` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `promotion` ( + |`p_promo_sk` INT, `p_promo_id` STRING, `p_start_date_sk` INT, `p_end_date_sk` INT, + |`p_item_sk` INT, `p_cost` DECIMAL(15,2), `p_response_target` INT, `p_promo_name` STRING, + |`p_channel_dmail` STRING, `p_channel_email` STRING, `p_channel_catalog` STRING, + |`p_channel_tv` STRING, `p_channel_radio` STRING, `p_channel_press` STRING, + |`p_channel_event` STRING, `p_channel_demo` STRING, `p_channel_details` STRING, + |`p_purpose` STRING, `p_discount_active` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store` ( + |`s_store_sk` INT, `s_store_id` STRING, `s_rec_start_date` STRING, + |`s_rec_end_date` STRING, `s_closed_date_sk` INT, `s_store_name` STRING, + |`s_number_employees` INT, `s_floor_space` INT, `s_hours` STRING, `s_manager` STRING, + |`s_market_id` INT, `s_geography_class` STRING, `s_market_desc` STRING, + |`s_market_manager` STRING, `s_division_id` INT, `s_division_name` STRING, + |`s_company_id` INT, `s_company_name` STRING, `s_street_number` STRING, + |`s_street_name` STRING, `s_street_type` STRING, `s_suite_number` STRING, `s_city` STRING, + |`s_county` STRING, `s_state` STRING, `s_zip` STRING, `s_country` STRING, + |`s_gmt_offset` DECIMAL(5,2), `s_tax_precentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store_returns` ( + |`sr_returned_date_sk` BIGINT, `sr_return_time_sk` BIGINT, `sr_item_sk` BIGINT, + |`sr_customer_sk` BIGINT, `sr_cdemo_sk` BIGINT, `sr_hdemo_sk` BIGINT, `sr_addr_sk` BIGINT, + |`sr_store_sk` BIGINT, `sr_reason_sk` BIGINT, `sr_ticket_number` BIGINT, + |`sr_return_quantity` BIGINT, `sr_return_amt` DECIMAL(7,2), `sr_return_tax` DECIMAL(7,2), + |`sr_return_amt_inc_tax` DECIMAL(7,2), `sr_fee` DECIMAL(7,2), + |`sr_return_ship_cost` DECIMAL(7,2), `sr_refunded_cash` DECIMAL(7,2), + |`sr_reversed_charge` DECIMAL(7,2), `sr_store_credit` DECIMAL(7,2), + |`sr_net_loss` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `catalog_sales` ( + |`cs_sold_date_sk` INT, `cs_sold_time_sk` INT, `cs_ship_date_sk` INT, + |`cs_bill_customer_sk` INT, `cs_bill_cdemo_sk` INT, `cs_bill_hdemo_sk` INT, + |`cs_bill_addr_sk` INT, `cs_ship_customer_sk` INT, `cs_ship_cdemo_sk` INT, + |`cs_ship_hdemo_sk` INT, `cs_ship_addr_sk` INT, `cs_call_center_sk` INT, + |`cs_catalog_page_sk` INT, 
`cs_ship_mode_sk` INT, `cs_warehouse_sk` INT, + |`cs_item_sk` INT, `cs_promo_sk` INT, `cs_order_number` INT, `cs_quantity` INT, + |`cs_wholesale_cost` DECIMAL(7,2), `cs_list_price` DECIMAL(7,2), + |`cs_sales_price` DECIMAL(7,2), `cs_ext_discount_amt` DECIMAL(7,2), + |`cs_ext_sales_price` DECIMAL(7,2), `cs_ext_wholesale_cost` DECIMAL(7,2), + |`cs_ext_list_price` DECIMAL(7,2), `cs_ext_tax` DECIMAL(7,2), `cs_coupon_amt` DECIMAL(7,2), + |`cs_ext_ship_cost` DECIMAL(7,2), `cs_net_paid` DECIMAL(7,2), + |`cs_net_paid_inc_tax` DECIMAL(7,2), `cs_net_paid_inc_ship` DECIMAL(7,2), + |`cs_net_paid_inc_ship_tax` DECIMAL(7,2), `cs_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_sales` ( + |`ws_sold_date_sk` INT, `ws_sold_time_sk` INT, `ws_ship_date_sk` INT, `ws_item_sk` INT, + |`ws_bill_customer_sk` INT, `ws_bill_cdemo_sk` INT, `ws_bill_hdemo_sk` INT, + |`ws_bill_addr_sk` INT, `ws_ship_customer_sk` INT, `ws_ship_cdemo_sk` INT, + |`ws_ship_hdemo_sk` INT, `ws_ship_addr_sk` INT, `ws_web_page_sk` INT, `ws_web_site_sk` INT, + |`ws_ship_mode_sk` INT, `ws_warehouse_sk` INT, `ws_promo_sk` INT, `ws_order_number` INT, + |`ws_quantity` INT, `ws_wholesale_cost` DECIMAL(7,2), `ws_list_price` DECIMAL(7,2), + |`ws_sales_price` DECIMAL(7,2), `ws_ext_discount_amt` DECIMAL(7,2), + |`ws_ext_sales_price` DECIMAL(7,2), `ws_ext_wholesale_cost` DECIMAL(7,2), + |`ws_ext_list_price` DECIMAL(7,2), `ws_ext_tax` DECIMAL(7,2), + |`ws_coupon_amt` DECIMAL(7,2), `ws_ext_ship_cost` DECIMAL(7,2), `ws_net_paid` DECIMAL(7,2), + |`ws_net_paid_inc_tax` DECIMAL(7,2), `ws_net_paid_inc_ship` DECIMAL(7,2), + |`ws_net_paid_inc_ship_tax` DECIMAL(7,2), `ws_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store_sales` ( + |`ss_sold_date_sk` INT, `ss_sold_time_sk` INT, `ss_item_sk` INT, `ss_customer_sk` INT, + |`ss_cdemo_sk` INT, `ss_hdemo_sk` INT, `ss_addr_sk` INT, `ss_store_sk` INT, + |`ss_promo_sk` INT, `ss_ticket_number` INT, `ss_quantity` INT, + |`ss_wholesale_cost` DECIMAL(7,2), `ss_list_price` DECIMAL(7,2), + |`ss_sales_price` DECIMAL(7,2), `ss_ext_discount_amt` DECIMAL(7,2), + |`ss_ext_sales_price` DECIMAL(7,2), `ss_ext_wholesale_cost` DECIMAL(7,2), + |`ss_ext_list_price` DECIMAL(7,2), `ss_ext_tax` DECIMAL(7,2), + |`ss_coupon_amt` DECIMAL(7,2), `ss_net_paid` DECIMAL(7,2), + |`ss_net_paid_inc_tax` DECIMAL(7,2), `ss_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_returns` ( + |`wr_returned_date_sk` BIGINT, `wr_returned_time_sk` BIGINT, `wr_item_sk` BIGINT, + |`wr_refunded_customer_sk` BIGINT, `wr_refunded_cdemo_sk` BIGINT, + |`wr_refunded_hdemo_sk` BIGINT, `wr_refunded_addr_sk` BIGINT, + |`wr_returning_customer_sk` BIGINT, `wr_returning_cdemo_sk` BIGINT, + |`wr_returning_hdemo_sk` BIGINT, `wr_returning_addr_sk` BIGINT, `wr_web_page_sk` BIGINT, + |`wr_reason_sk` BIGINT, `wr_order_number` BIGINT, `wr_return_quantity` BIGINT, + |`wr_return_amt` DECIMAL(7,2), `wr_return_tax` DECIMAL(7,2), + |`wr_return_amt_inc_tax` DECIMAL(7,2), `wr_fee` DECIMAL(7,2), + |`wr_return_ship_cost` DECIMAL(7,2), `wr_refunded_cash` DECIMAL(7,2), + |`wr_reversed_charge` DECIMAL(7,2), `wr_account_credit` DECIMAL(7,2), + |`wr_net_loss` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_site` ( + |`web_site_sk` INT, `web_site_id` STRING, `web_rec_start_date` DATE, + |`web_rec_end_date` DATE, `web_name` STRING, `web_open_date_sk` INT, + |`web_close_date_sk` INT, `web_class` STRING, `web_manager` 
STRING, `web_mkt_id` INT, + |`web_mkt_class` STRING, `web_mkt_desc` STRING, `web_market_manager` STRING, + |`web_company_id` INT, `web_company_name` STRING, `web_street_number` STRING, + |`web_street_name` STRING, `web_street_type` STRING, `web_suite_number` STRING, + |`web_city` STRING, `web_county` STRING, `web_state` STRING, `web_zip` STRING, + |`web_country` STRING, `web_gmt_offset` STRING, `web_tax_percentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `reason` ( + |`r_reason_sk` INT, `r_reason_id` STRING, `r_reason_desc` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `call_center` ( + |`cc_call_center_sk` INT, `cc_call_center_id` STRING, `cc_rec_start_date` DATE, + |`cc_rec_end_date` DATE, `cc_closed_date_sk` INT, `cc_open_date_sk` INT, `cc_name` STRING, + |`cc_class` STRING, `cc_employees` INT, `cc_sq_ft` INT, `cc_hours` STRING, + |`cc_manager` STRING, `cc_mkt_id` INT, `cc_mkt_class` STRING, `cc_mkt_desc` STRING, + |`cc_market_manager` STRING, `cc_division` INT, `cc_division_name` STRING, `cc_company` INT, + |`cc_company_name` STRING, `cc_street_number` STRING, `cc_street_name` STRING, + |`cc_street_type` STRING, `cc_suite_number` STRING, `cc_city` STRING, `cc_county` STRING, + |`cc_state` STRING, `cc_zip` STRING, `cc_country` STRING, `cc_gmt_offset` DECIMAL(5,2), + |`cc_tax_percentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `warehouse` ( + |`w_warehouse_sk` INT, `w_warehouse_id` STRING, `w_warehouse_name` STRING, + |`w_warehouse_sq_ft` INT, `w_street_number` STRING, `w_street_name` STRING, + |`w_street_type` STRING, `w_suite_number` STRING, `w_city` STRING, `w_county` STRING, + |`w_state` STRING, `w_zip` STRING, `w_country` STRING, `w_gmt_offset` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `ship_mode` ( + |`sm_ship_mode_sk` INT, `sm_ship_mode_id` STRING, `sm_type` STRING, `sm_code` STRING, + |`sm_carrier` STRING, `sm_contract` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `income_band` ( + |`ib_income_band_sk` INT, `ib_lower_bound` INT, `ib_upper_bound` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `time_dim` ( + |`t_time_sk` INT, `t_time_id` STRING, `t_time` INT, `t_hour` INT, `t_minute` INT, + |`t_second` INT, `t_am_pm` STRING, `t_shift` STRING, `t_sub_shift` STRING, + |`t_meal_time` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_page` (`wp_web_page_sk` INT, `wp_web_page_id` STRING, + |`wp_rec_start_date` DATE, `wp_rec_end_date` DATE, `wp_creation_date_sk` INT, + |`wp_access_date_sk` INT, `wp_autogen_flag` STRING, `wp_customer_sk` INT, + |`wp_url` STRING, `wp_type` STRING, `wp_char_count` INT, `wp_link_count` INT, + |`wp_image_count` INT, `wp_max_ad_count` INT) + |USING parquet + """.stripMargin) + } + + val tpcdsQueries = Seq( + "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14a", "q14b", "q15", "q16", "q17", "q18", "q19", "q20", + "q21", "q22", "q23a", "q23b", "q24a", "q24b", "q25", "q26", "q27", "q28", "q29", "q30", + "q31", "q32", "q33", "q34", "q35", "q36", "q37", "q38", "q39a", "q39b", "q40", + "q41", "q42", "q43", "q44", "q45", "q46", "q47", "q48", "q49", "q50", + "q51", "q52", "q53", "q54", "q55", "q56", "q57", "q58", "q59", "q60", + "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q68", "q69", "q70", + "q71", "q72", "q73", "q74", "q75", "q76", "q77", "q78", "q79", "q80", + "q81", "q82", "q83", "q84", 
"q85", "q86", "q87", "q88", "q89", "q90", + "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") + + tpcdsQueries.foreach { name => + val queryString = resourceToString(s"tpcds/$name.sql", + classLoader = Thread.currentThread().getContextClassLoader) + test(name) { + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + sql(queryString).collect() + } + } + } +} From 7bf4da8a33c33b03bbfddc698335fe9b86ce1e0e Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 28 Sep 2017 10:24:51 +0900 Subject: [PATCH 635/779] [MINOR] Fixed up pandas_udf related docs and formatting ## What changes were proposed in this pull request? Fixed some minor issues with pandas_udf related docs and formatting. ## How was this patch tested? NA Author: Bryan Cutler Closes #19375 from BryanCutler/arrow-pandas_udf-cleanup-minor. --- python/pyspark/serializers.py | 6 +++--- python/pyspark/sql/functions.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index db77b7e150b24..ad18bd0c81eaa 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -191,7 +191,7 @@ def loads(self, obj): class ArrowSerializer(FramedSerializer): """ - Serializes an Arrow stream. + Serializes bytes as Arrow data with the Arrow file format. """ def dumps(self, batch): @@ -239,7 +239,7 @@ class ArrowStreamPandasSerializer(Serializer): def dump_stream(self, iterator, stream): """ - Make ArrowRecordBatches from Pandas Serieses and serialize. Input is a single series or + Make ArrowRecordBatches from Pandas Series and serialize. Input is a single series or a list of series accompanied by an optional pyarrow type to coerce the data to. """ import pyarrow as pa @@ -257,7 +257,7 @@ def dump_stream(self, iterator, stream): def load_stream(self, stream): """ - Deserialize ArrowRecordBatchs to an Arrow table and return as a list of pandas.Series. + Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series. """ import pyarrow as pa reader = pa.open_stream(stream) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 63e9a830bbc9e..b45a59db93679 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2199,16 +2199,14 @@ def pandas_udf(f=None, returnType=StringType()): ... >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP + ... .show() # doctest: +SKIP +----------+--------------+------------+ |slen(name)|to_upper(name)|add_one(age)| +----------+--------------+------------+ | 8| JOHN DOE| 22| +----------+--------------+------------+ """ - wrapped_udf = _create_udf(f, returnType=returnType, vectorized=True) - - return wrapped_udf + return _create_udf(f, returnType=returnType, vectorized=True) blacklist = ['map', 'since', 'ignore_unicode_prefix'] From 3b117d631e1ff387b70ed8efba229594f4594db5 Mon Sep 17 00:00:00 2001 From: zhoukang Date: Thu, 28 Sep 2017 09:25:21 +0800 Subject: [PATCH 636/779] [SPARK-22123][CORE] Add latest failure reason for task set blacklist ## What changes were proposed in this pull request? This patch add latest failure reason for task set blacklist.Which can be showed on spark ui and let user know failure reason directly. 
Till now , every job which aborted by completed blacklist just show log like below which has no more information: `Aborting $taskSet because task $indexInTaskSet (partition $partition) cannot run anywhere due to node and executor blacklist. Blacklisting behavior cannot run anywhere due to node and executor blacklist.Blacklisting behavior can be configured via spark.blacklist.*."` **After modify:** ``` Aborting TaskSet 0.0 because task 0 (partition 0) cannot run anywhere due to node and executor blacklist. Most recent failure: Some(Lost task 0.1 in stage 0.0 (TID 3,xxx, executor 1): java.lang.Exception: Fake error! at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:73) at org.apache.spark.scheduler.Task.run(Task.scala:99) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:305) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) ). Blacklisting behavior can be configured via spark.blacklist.*. ``` ## How was this patch tested? Unit test and manually test. Author: zhoukang Closes #19338 from caneGuy/zhoukang/improve-blacklist. --- .../spark/scheduler/TaskSetBlacklist.scala | 14 ++++- .../spark/scheduler/TaskSetManager.scala | 15 +++-- .../scheduler/BlacklistIntegrationSuite.scala | 5 +- .../scheduler/BlacklistTrackerSuite.scala | 60 ++++++++++++------- .../scheduler/TaskSchedulerImplSuite.scala | 11 +++- .../scheduler/TaskSetBlacklistSuite.scala | 45 +++++++++----- .../spark/scheduler/TaskSetManagerSuite.scala | 2 +- 7 files changed, 104 insertions(+), 48 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala index e815b7e0cf6c9..233781f3d9719 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala @@ -61,6 +61,16 @@ private[scheduler] class TaskSetBlacklist(val conf: SparkConf, val stageId: Int, private val blacklistedExecs = new HashSet[String]() private val blacklistedNodes = new HashSet[String]() + private var latestFailureReason: String = null + + /** + * Get the most recent failure reason of this TaskSet. + * @return + */ + def getLatestFailureReason: String = { + latestFailureReason + } + /** * Return true if this executor is blacklisted for the given task. 
This does *not* * need to return true if the executor is blacklisted for the entire stage, or blacklisted @@ -94,7 +104,9 @@ private[scheduler] class TaskSetBlacklist(val conf: SparkConf, val stageId: Int, private[scheduler] def updateBlacklistForFailedTask( host: String, exec: String, - index: Int): Unit = { + index: Int, + failureReason: String): Unit = { + latestFailureReason = failureReason val execFailures = execToFailures.getOrElseUpdate(exec, new ExecutorFailuresInTaskSet(host)) execFailures.updateWithFailure(index, clock.getTimeMillis()) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 3804ea863b4f9..bb867416a4fac 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -670,9 +670,14 @@ private[spark] class TaskSetManager( } if (blacklistedEverywhere) { val partition = tasks(indexInTaskSet).partitionId - abort(s"Aborting $taskSet because task $indexInTaskSet (partition $partition) " + - s"cannot run anywhere due to node and executor blacklist. Blacklisting behavior " + - s"can be configured via spark.blacklist.*.") + abort(s""" + |Aborting $taskSet because task $indexInTaskSet (partition $partition) + |cannot run anywhere due to node and executor blacklist. + |Most recent failure: + |${taskSetBlacklist.getLatestFailureReason} + | + |Blacklisting behavior can be configured via spark.blacklist.*. + |""".stripMargin) } } } @@ -837,9 +842,9 @@ private[spark] class TaskSetManager( sched.dagScheduler.taskEnded(tasks(index), reason, null, accumUpdates, info) if (!isZombie && reason.countTowardsTaskFailures) { - taskSetBlacklistHelperOpt.foreach(_.updateBlacklistForFailedTask( - info.host, info.executorId, index)) assert (null != failureReason) + taskSetBlacklistHelperOpt.foreach(_.updateBlacklistForFailedTask( + info.host, info.executorId, index, failureReason)) numFailures(index) += 1 if (numFailures(index) >= maxTaskFailures) { logError("Task %d in stage %s failed %d times; aborting job".format( diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index f6015cd51c2bd..d3bbfd11d406d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -115,8 +115,9 @@ class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorM withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) awaitJobTermination(jobFuture, duration) - val pattern = ("Aborting TaskSet 0.0 because task .* " + - "cannot run anywhere due to node and executor blacklist").r + val pattern = ( + s"""|Aborting TaskSet 0.0 because task .* + |cannot run anywhere due to node and executor blacklist""".stripMargin).r assert(pattern.findFirstIn(failure.getMessage).isDefined, s"Couldn't find $pattern in ${failure.getMessage()}") } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala index a136d69b36d6c..cd1b7a9e5ab18 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala @@ -110,7 +110,8 @@ class 
BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M val taskSetBlacklist = createTaskSetBlacklist(stageId) if (stageId % 2 == 0) { // fail one task in every other taskset - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") failuresSoFar += 1 } blacklist.updateBlacklistForSuccessfulTaskSet(stageId, 0, taskSetBlacklist.execToFailures) @@ -132,7 +133,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // for many different stages, executor 1 fails a task, and then the taskSet fails. (0 until failuresUntilBlacklisted * 10).foreach { stage => val taskSetBlacklist = createTaskSetBlacklist(stage) - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") } assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set()) } @@ -147,7 +149,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M val numFailures = math.max(conf.get(config.MAX_FAILURES_PER_EXEC), conf.get(config.MAX_FAILURES_PER_EXEC_STAGE)) (0 until numFailures).foreach { index => - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = index) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = index, failureReason = "testing") } assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("1")) assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set()) @@ -170,7 +173,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Fail 4 tasks in one task set on executor 1, so that executor gets blacklisted for the whole // application. (0 until 4).foreach { partition => - taskSetBlacklist0.updateBlacklistForFailedTask("hostA", exec = "1", index = partition) + taskSetBlacklist0.updateBlacklistForFailedTask( + "hostA", exec = "1", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist0.execToFailures) assert(blacklist.nodeBlacklist() === Set()) @@ -183,7 +187,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // application. Since that's the second executor that is blacklisted on the same node, we also // blacklist that node. (0 until 4).foreach { partition => - taskSetBlacklist1.updateBlacklistForFailedTask("hostA", exec = "2", index = partition) + taskSetBlacklist1.updateBlacklistForFailedTask( + "hostA", exec = "2", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist1.execToFailures) assert(blacklist.nodeBlacklist() === Set("hostA")) @@ -207,7 +212,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Fail one more task, but executor isn't put back into blacklist since the count of failures // on that executor should have been reset to 0. 
val taskSetBlacklist2 = createTaskSetBlacklist(stageId = 2) - taskSetBlacklist2.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) + taskSetBlacklist2.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") blacklist.updateBlacklistForSuccessfulTaskSet(2, 0, taskSetBlacklist2.execToFailures) assert(blacklist.nodeBlacklist() === Set()) assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set()) @@ -221,7 +227,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Lets say that executor 1 dies completely. We get some task failures, but // the taskset then finishes successfully (elsewhere). (0 until 4).foreach { partition => - taskSetBlacklist0.updateBlacklistForFailedTask("hostA", exec = "1", index = partition) + taskSetBlacklist0.updateBlacklistForFailedTask( + "hostA", exec = "1", index = partition, failureReason = "testing") } blacklist.handleRemovedExecutor("1") blacklist.updateBlacklistForSuccessfulTaskSet( @@ -236,7 +243,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Now another executor gets spun up on that host, but it also dies. val taskSetBlacklist1 = createTaskSetBlacklist(stageId = 1) (0 until 4).foreach { partition => - taskSetBlacklist1.updateBlacklistForFailedTask("hostA", exec = "2", index = partition) + taskSetBlacklist1.updateBlacklistForFailedTask( + "hostA", exec = "2", index = partition, failureReason = "testing") } blacklist.handleRemovedExecutor("2") blacklist.updateBlacklistForSuccessfulTaskSet( @@ -279,7 +287,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M def failOneTaskInTaskSet(exec: String): Unit = { val taskSetBlacklist = createTaskSetBlacklist(stageId = stageId) - taskSetBlacklist.updateBlacklistForFailedTask("host-" + exec, exec, 0) + taskSetBlacklist.updateBlacklistForFailedTask("host-" + exec, exec, 0, "testing") blacklist.updateBlacklistForSuccessfulTaskSet(stageId, 0, taskSetBlacklist.execToFailures) stageId += 1 } @@ -354,12 +362,12 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M val taskSetBlacklist1 = createTaskSetBlacklist(stageId = 1) val taskSetBlacklist2 = createTaskSetBlacklist(stageId = 2) // Taskset1 has one failure immediately - taskSetBlacklist1.updateBlacklistForFailedTask("host-1", "1", 0) + taskSetBlacklist1.updateBlacklistForFailedTask("host-1", "1", 0, "testing") // Then we have a *long* delay, much longer than the timeout, before any other failures or // taskset completion clock.advance(blacklist.BLACKLIST_TIMEOUT_MILLIS * 5) // After the long delay, we have one failure on taskset 2, on the same executor - taskSetBlacklist2.updateBlacklistForFailedTask("host-1", "1", 0) + taskSetBlacklist2.updateBlacklistForFailedTask("host-1", "1", 0, "testing") // Finally, we complete both tasksets. Its important here to complete taskset2 *first*. We // want to make sure that when taskset 1 finishes, even though we've now got two task failures, // we realize that the task failure we just added was well before the timeout. 
@@ -377,16 +385,20 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // we blacklist executors on two different hosts -- make sure that doesn't lead to any // node blacklisting val taskSetBlacklist0 = createTaskSetBlacklist(stageId = 0) - taskSetBlacklist0.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) - taskSetBlacklist0.updateBlacklistForFailedTask("hostA", exec = "1", index = 1) + taskSetBlacklist0.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") + taskSetBlacklist0.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 1, failureReason = "testing") blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist0.execToFailures) assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set("1")) verify(listenerBusMock).post(SparkListenerExecutorBlacklisted(0, "1", 2)) assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set()) val taskSetBlacklist1 = createTaskSetBlacklist(stageId = 1) - taskSetBlacklist1.updateBlacklistForFailedTask("hostB", exec = "2", index = 0) - taskSetBlacklist1.updateBlacklistForFailedTask("hostB", exec = "2", index = 1) + taskSetBlacklist1.updateBlacklistForFailedTask( + "hostB", exec = "2", index = 0, failureReason = "testing") + taskSetBlacklist1.updateBlacklistForFailedTask( + "hostB", exec = "2", index = 1, failureReason = "testing") blacklist.updateBlacklistForSuccessfulTaskSet(1, 0, taskSetBlacklist1.execToFailures) assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set("1", "2")) verify(listenerBusMock).post(SparkListenerExecutorBlacklisted(0, "2", 2)) @@ -395,8 +407,10 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Finally, blacklist another executor on the same node as the original blacklisted executor, // and make sure this time we *do* blacklist the node. val taskSetBlacklist2 = createTaskSetBlacklist(stageId = 0) - taskSetBlacklist2.updateBlacklistForFailedTask("hostA", exec = "3", index = 0) - taskSetBlacklist2.updateBlacklistForFailedTask("hostA", exec = "3", index = 1) + taskSetBlacklist2.updateBlacklistForFailedTask( + "hostA", exec = "3", index = 0, failureReason = "testing") + taskSetBlacklist2.updateBlacklistForFailedTask( + "hostA", exec = "3", index = 1, failureReason = "testing") blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist2.execToFailures) assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set("1", "2", "3")) verify(listenerBusMock).post(SparkListenerExecutorBlacklisted(0, "3", 2)) @@ -486,7 +500,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Fail 4 tasks in one task set on executor 1, so that executor gets blacklisted for the whole // application. (0 until 4).foreach { partition => - taskSetBlacklist0.updateBlacklistForFailedTask("hostA", exec = "1", index = partition) + taskSetBlacklist0.updateBlacklistForFailedTask( + "hostA", exec = "1", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist0.execToFailures) @@ -497,7 +512,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // application. Since that's the second executor that is blacklisted on the same node, we also // blacklist that node. 
(0 until 4).foreach { partition => - taskSetBlacklist1.updateBlacklistForFailedTask("hostA", exec = "2", index = partition) + taskSetBlacklist1.updateBlacklistForFailedTask( + "hostA", exec = "2", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist1.execToFailures) @@ -512,7 +528,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // Fail 4 tasks in one task set on executor 1, so that executor gets blacklisted for the whole // application. (0 until 4).foreach { partition => - taskSetBlacklist2.updateBlacklistForFailedTask("hostA", exec = "1", index = partition) + taskSetBlacklist2.updateBlacklistForFailedTask( + "hostA", exec = "1", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist2.execToFailures) @@ -523,7 +540,8 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M // application. Since that's the second executor that is blacklisted on the same node, we also // blacklist that node. (0 until 4).foreach { partition => - taskSetBlacklist3.updateBlacklistForFailedTask("hostA", exec = "2", index = partition) + taskSetBlacklist3.updateBlacklistForFailedTask( + "hostA", exec = "2", index = partition, failureReason = "testing") } blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist3.execToFailures) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index b8626bf777598..6003899bb7bef 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -660,9 +660,14 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(tsm.isZombie) assert(failedTaskSet) val idx = failedTask.index - assert(failedTaskSetReason === s"Aborting TaskSet 0.0 because task $idx (partition $idx) " + - s"cannot run anywhere due to node and executor blacklist. Blacklisting behavior can be " + - s"configured via spark.blacklist.*.") + assert(failedTaskSetReason === s""" + |Aborting $taskSet because task $idx (partition $idx) + |cannot run anywhere due to node and executor blacklist. + |Most recent failure: + |${tsm.taskSetBlacklistHelperOpt.get.getLatestFailureReason} + | + |Blacklisting behavior can be configured via spark.blacklist.*. + |""".stripMargin) } test("don't abort if there is an executor available, though it hasn't had scheduled tasks yet") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala index f1392e9db6bfd..18981d5be2f94 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala @@ -37,7 +37,8 @@ class TaskSetBlacklistSuite extends SparkFunSuite { // First, mark task 0 as failed on exec1. 
// task 0 should be blacklisted on exec1, and nowhere else - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "exec1", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "exec1", index = 0, failureReason = "testing") for { executor <- (1 to 4).map(_.toString) index <- 0 until 10 @@ -49,17 +50,20 @@ class TaskSetBlacklistSuite extends SparkFunSuite { assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Mark task 1 failed on exec1 -- this pushes the executor into the blacklist - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "exec1", index = 1) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "exec1", index = 1, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("exec1")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Mark one task as failed on exec2 -- not enough for any further blacklisting yet. - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "exec2", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "exec2", index = 0, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("exec1")) assert(!taskSetBlacklist.isExecutorBlacklistedForTaskSet("exec2")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Mark another task as failed on exec2 -- now we blacklist exec2, which also leads to // blacklisting the entire node. - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "exec2", index = 1) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "exec2", index = 1, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("exec1")) assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("exec2")) assert(taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) @@ -108,34 +112,41 @@ class TaskSetBlacklistSuite extends SparkFunSuite { .set(config.MAX_FAILED_EXEC_PER_NODE_STAGE, 3) val taskSetBlacklist = new TaskSetBlacklist(conf, stageId = 0, new SystemClock()) // Fail a task twice on hostA, exec:1 - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTask("1", 0)) assert(!taskSetBlacklist.isNodeBlacklistedForTask("hostA", 0)) assert(!taskSetBlacklist.isExecutorBlacklistedForTaskSet("1")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Fail the same task once more on hostA, exec:2 - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "2", index = 0) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "2", index = 0, failureReason = "testing") assert(taskSetBlacklist.isNodeBlacklistedForTask("hostA", 0)) assert(!taskSetBlacklist.isExecutorBlacklistedForTaskSet("2")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Fail another task on hostA, exec:1. 
Now that executor has failures on two different tasks, // so its blacklisted - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 1) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 1, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("1")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Fail a third task on hostA, exec:2, so that exec is blacklisted for the whole task set - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "2", index = 2) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "2", index = 2, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("2")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) // Fail a fourth & fifth task on hostA, exec:3. Now we've got three executors that are // blacklisted for the taskset, so blacklist the whole node. - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "3", index = 3) - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "3", index = 4) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "3", index = 3, failureReason = "testing") + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "3", index = 4, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("3")) assert(taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) } @@ -147,13 +158,17 @@ class TaskSetBlacklistSuite extends SparkFunSuite { val conf = new SparkConf().setAppName("test").setMaster("local") .set(config.BLACKLIST_ENABLED.key, "true") val taskSetBlacklist = new TaskSetBlacklist(conf, stageId = 0, new SystemClock()) - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 0) - taskSetBlacklist.updateBlacklistForFailedTask("hostA", exec = "1", index = 1) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 0, failureReason = "testing") + taskSetBlacklist.updateBlacklistForFailedTask( + "hostA", exec = "1", index = 1, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("1")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) - taskSetBlacklist.updateBlacklistForFailedTask("hostB", exec = "2", index = 0) - taskSetBlacklist.updateBlacklistForFailedTask("hostB", exec = "2", index = 1) + taskSetBlacklist.updateBlacklistForFailedTask( + "hostB", exec = "2", index = 0, failureReason = "testing") + taskSetBlacklist.updateBlacklistForFailedTask( + "hostB", exec = "2", index = 1, failureReason = "testing") assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("1")) assert(taskSetBlacklist.isExecutorBlacklistedForTaskSet("2")) assert(!taskSetBlacklist.isNodeBlacklistedForTaskSet("hostA")) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index ae43f4cadc037..5c712bd6a545b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -1146,7 +1146,7 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg // Make sure that the blacklist ignored all of the task failures above, since they aren't // the fault of the executor where the task was running. 
verify(blacklist, never()) - .updateBlacklistForFailedTask(anyString(), anyString(), anyInt()) + .updateBlacklistForFailedTask(anyString(), anyString(), anyInt(), anyString()) } test("update application blacklist for shuffle-fetch") { From f20be4d70bf321f377020d1bde761a43e5c72f0a Mon Sep 17 00:00:00 2001 From: Paul Mackles Date: Thu, 28 Sep 2017 14:43:31 +0800 Subject: [PATCH 637/779] [SPARK-22135][MESOS] metrics in spark-dispatcher not being registered properly ## What changes were proposed in this pull request? Fix a trivial bug in how metrics are registered in the Mesos dispatcher: the bug caused a new registry to be created each time the metricRegistry() method was called, so metrics were never registered properly. ## How was this patch tested? Verified manually on a local Mesos setup. Author: Paul Mackles Closes #19358 from pmackles/SPARK-22135. --- .../cluster/mesos/MesosClusterSchedulerSource.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala index 1fe94974c8e36..76aded4edb431 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala @@ -23,8 +23,9 @@ import org.apache.spark.metrics.source.Source private[mesos] class MesosClusterSchedulerSource(scheduler: MesosClusterScheduler) extends Source { - override def sourceName: String = "mesos_cluster" - override def metricRegistry: MetricRegistry = new MetricRegistry() + + override val sourceName: String = "mesos_cluster" + override val metricRegistry: MetricRegistry = new MetricRegistry() metricRegistry.register(MetricRegistry.name("waitingDrivers"), new Gauge[Int] { override def getValue: Int = scheduler.getQueuedDriversSize From 01bd00d13532af1c7328997cbec446b0d3e21459 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 28 Sep 2017 08:22:48 +0100 Subject: [PATCH 638/779] [SPARK-22128][CORE] Update paranamer to 2.8 to avoid BytecodeReadingParanamer ArrayIndexOutOfBoundsException with Scala 2.12 + Java 8 lambda ## What changes were proposed in this pull request? Un-manage the jackson-module-paranamer version to let it use the version desired by jackson-module-scala; manage paranamer up to 2.8 for jackson-module-scala 2.7.9, to override avro 1.7.7's desired paranamer 2.3. ## How was this patch tested? Existing tests Author: Sean Owen Closes #19352 from srowen/SPARK-22128.
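As an aside on the SPARK-22135 dispatcher fix above: the following is a minimal sketch, not part of any patch, assuming only the Dropwizard `MetricRegistry` API and using hypothetical class names, of why defining `metricRegistry` as a `def` loses registrations while a `val` shares a single registry.

```
import com.codahale.metrics.MetricRegistry

object RegistryDemo extends App {
  class BrokenSource {
    // A def builds a brand-new registry on every call, so earlier registrations are lost.
    def metricRegistry: MetricRegistry = new MetricRegistry()
  }
  class FixedSource {
    // A val creates the registry once; every caller sees the same instance.
    val metricRegistry: MetricRegistry = new MetricRegistry()
  }

  val broken = new BrokenSource
  broken.metricRegistry.counter("waitingDrivers").inc()
  println(broken.metricRegistry.getCounters.size()) // 0 -- the counter went to a throwaway registry

  val fixed = new FixedSource
  fixed.metricRegistry.counter("waitingDrivers").inc()
  println(fixed.metricRegistry.getCounters.size())  // 1 -- the counter is visible to later readers
}
```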
--- dev/deps/spark-deps-hadoop-2.6 | 4 ++-- dev/deps/spark-deps-hadoop-2.7 | 4 ++-- pom.xml | 10 ++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index e534e38213fb1..76fcbd15869f1 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -93,7 +93,7 @@ jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-paranamer-2.6.7.jar +jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar janino-3.0.0.jar @@ -153,7 +153,7 @@ orc-core-1.4.0-nohive.jar orc-mapreduce-1.4.0-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.8.2.jar parquet-common-1.8.2.jar parquet-encoding-1.8.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 02c5a19d173be..cb20072bf8b30 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -93,7 +93,7 @@ jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-paranamer-2.6.7.jar +jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar janino-3.0.0.jar @@ -154,7 +154,7 @@ orc-core-1.4.0-nohive.jar orc-mapreduce-1.4.0-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar -paranamer-2.6.jar +paranamer-2.8.jar parquet-column-1.8.2.jar parquet-common-1.8.2.jar parquet-encoding-1.8.2.jar diff --git a/pom.xml b/pom.xml index 83a35006707da..87a468c3a6f55 100644 --- a/pom.xml +++ b/pom.xml @@ -179,7 +179,10 @@ 4.7 1.1 2.52.0 - 2.6 + + 2.8 1.8 1.0.0 0.4.0 @@ -637,11 +640,6 @@ - - com.fasterxml.jackson.module - jackson-module-paranamer - ${fasterxml.jackson.version} - com.fasterxml.jackson.module jackson-module-jaxb-annotations From d74dee1336e7152cc0fb7d2b3bf1a44f4f452025 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 28 Sep 2017 09:20:37 -0700 Subject: [PATCH 639/779] [SPARK-22153][SQL] Rename ShuffleExchange -> ShuffleExchangeExec ## What changes were proposed in this pull request? For some reason when we added the Exec suffix to all physical operators, we missed this one. I was looking for this physical operator today and couldn't find it, because I was looking for ExchangeExec. ## How was this patch tested? This is a simple rename and should be covered by existing tests. Author: Reynold Xin Closes #19376 from rxin/SPARK-22153. 
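To make the impact of the rename concrete, here is a small hypothetical sketch, not part of the patch, of the kind of plan-inspection code (mirroring the test changes below) that must switch to the new operator name; `spark` is assumed to be an existing SparkSession and the DataFrame is arbitrary.

```
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec

val df = spark.range(100).repartition(10)
val numShuffles = df.queryExecution.executedPlan.collect {
  case e: ShuffleExchangeExec => e   // before this patch: case e: ShuffleExchange => e
}.length
println(s"shuffle exchanges in the plan: $numShuffles")
```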
--- .../spark/sql/execution/SparkStrategies.scala | 6 +-- .../exchange/EnsureRequirements.scala | 26 ++++++------- .../exchange/ExchangeCoordinator.scala | 38 +++++++++---------- ...change.scala => ShuffleExchangeExec.scala} | 10 ++--- .../apache/spark/sql/execution/limit.scala | 6 +-- .../streaming/IncrementalExecution.scala | 4 +- .../apache/spark/sql/CachedTableSuite.scala | 5 ++- .../org/apache/spark/sql/DataFrameSuite.scala | 10 ++--- .../org/apache/spark/sql/DatasetSuite.scala | 4 +- .../execution/ExchangeCoordinatorSuite.scala | 22 +++++------ .../spark/sql/execution/ExchangeSuite.scala | 12 +++--- .../spark/sql/execution/PlannerSuite.scala | 32 ++++++++-------- .../spark/sql/sources/BucketedReadSuite.scala | 10 ++--- .../EnsureStatefulOpPartitioningSuite.scala | 4 +- 14 files changed, 95 insertions(+), 94 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/{ShuffleExchange.scala => ShuffleExchangeExec.scala} (98%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 4da7a73469537..92eaab5cd8f81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.exchange.ShuffleExchange +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf @@ -411,7 +411,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Repartition(numPartitions, shuffle, child) => if (shuffle) { - ShuffleExchange(RoundRobinPartitioning(numPartitions), planLater(child)) :: Nil + ShuffleExchangeExec(RoundRobinPartitioning(numPartitions), planLater(child)) :: Nil } else { execution.CoalesceExec(numPartitions, planLater(child)) :: Nil } @@ -446,7 +446,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case r: logical.Range => execution.RangeExec(r) :: Nil case logical.RepartitionByExpression(expressions, child, numPartitions) => - exchange.ShuffleExchange(HashPartitioning( + exchange.ShuffleExchangeExec(HashPartitioning( expressions, numPartitions), planLater(child)) :: Nil case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil case r: LogicalRDD => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 1da72f2e92329..d28ce60e276d5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.internal.SQLConf * Ensures that the [[org.apache.spark.sql.catalyst.plans.physical.Partitioning Partitioning]] * of input data meets the * [[org.apache.spark.sql.catalyst.plans.physical.Distribution Distribution]] requirements for - * each operator by inserting [[ShuffleExchange]] Operators where required. 
Also ensure that the - * input partition ordering requirements are met. + * each operator by inserting [[ShuffleExchangeExec]] Operators where required. Also ensure that + * the input partition ordering requirements are met. */ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { private def defaultNumPreShufflePartitions: Int = conf.numShufflePartitions @@ -57,17 +57,17 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { } /** - * Adds [[ExchangeCoordinator]] to [[ShuffleExchange]]s if adaptive query execution is enabled - * and partitioning schemes of these [[ShuffleExchange]]s support [[ExchangeCoordinator]]. + * Adds [[ExchangeCoordinator]] to [[ShuffleExchangeExec]]s if adaptive query execution is enabled + * and partitioning schemes of these [[ShuffleExchangeExec]]s support [[ExchangeCoordinator]]. */ private def withExchangeCoordinator( children: Seq[SparkPlan], requiredChildDistributions: Seq[Distribution]): Seq[SparkPlan] = { val supportsCoordinator = - if (children.exists(_.isInstanceOf[ShuffleExchange])) { + if (children.exists(_.isInstanceOf[ShuffleExchangeExec])) { // Right now, ExchangeCoordinator only support HashPartitionings. children.forall { - case e @ ShuffleExchange(hash: HashPartitioning, _, _) => true + case e @ ShuffleExchangeExec(hash: HashPartitioning, _, _) => true case child => child.outputPartitioning match { case hash: HashPartitioning => true @@ -94,7 +94,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { targetPostShuffleInputSize, minNumPostShufflePartitions) children.zip(requiredChildDistributions).map { - case (e: ShuffleExchange, _) => + case (e: ShuffleExchangeExec, _) => // This child is an Exchange, we need to add the coordinator. e.copy(coordinator = Some(coordinator)) case (child, distribution) => @@ -138,7 +138,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { val targetPartitioning = createPartitioning(distribution, defaultNumPreShufflePartitions) assert(targetPartitioning.isInstanceOf[HashPartitioning]) - ShuffleExchange(targetPartitioning, child, Some(coordinator)) + ShuffleExchangeExec(targetPartitioning, child, Some(coordinator)) } } else { // If we do not need ExchangeCoordinator, the original children are returned. @@ -162,7 +162,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { case (child, BroadcastDistribution(mode)) => BroadcastExchangeExec(mode, child) case (child, distribution) => - ShuffleExchange(createPartitioning(distribution, defaultNumPreShufflePartitions), child) + ShuffleExchangeExec(createPartitioning(distribution, defaultNumPreShufflePartitions), child) } // If the operator has multiple children and specifies child output distributions (e.g. join), @@ -215,8 +215,8 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { child match { // If child is an exchange, we replace it with // a new one having targetPartitioning. 
- case ShuffleExchange(_, c, _) => ShuffleExchange(targetPartitioning, c) - case _ => ShuffleExchange(targetPartitioning, child) + case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(targetPartitioning, c) + case _ => ShuffleExchangeExec(targetPartitioning, child) } } } @@ -246,9 +246,9 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { } def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator @ ShuffleExchange(partitioning, child, _) => + case operator @ ShuffleExchangeExec(partitioning, child, _) => child.children match { - case ShuffleExchange(childPartitioning, baseChild, _)::Nil => + case ShuffleExchangeExec(childPartitioning, baseChild, _)::Nil => if (childPartitioning.guarantees(partitioning)) child else operator case _ => operator } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala index 9fc4ffb651ec8..78f11ca8d8c78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ExchangeCoordinator.scala @@ -35,9 +35,9 @@ import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan} * * A coordinator is constructed with three parameters, `numExchanges`, * `targetPostShuffleInputSize`, and `minNumPostShufflePartitions`. - * - `numExchanges` is used to indicated that how many [[ShuffleExchange]]s that will be registered - * to this coordinator. So, when we start to do any actual work, we have a way to make sure that - * we have got expected number of [[ShuffleExchange]]s. + * - `numExchanges` is used to indicated that how many [[ShuffleExchangeExec]]s that will be + * registered to this coordinator. So, when we start to do any actual work, we have a way to + * make sure that we have got expected number of [[ShuffleExchangeExec]]s. * - `targetPostShuffleInputSize` is the targeted size of a post-shuffle partition's * input data size. With this parameter, we can estimate the number of post-shuffle partitions. * This parameter is configured through @@ -47,28 +47,28 @@ import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan} * partitions. * * The workflow of this coordinator is described as follows: - * - Before the execution of a [[SparkPlan]], for a [[ShuffleExchange]] operator, + * - Before the execution of a [[SparkPlan]], for a [[ShuffleExchangeExec]] operator, * if an [[ExchangeCoordinator]] is assigned to it, it registers itself to this coordinator. * This happens in the `doPrepare` method. - * - Once we start to execute a physical plan, a [[ShuffleExchange]] registered to this + * - Once we start to execute a physical plan, a [[ShuffleExchangeExec]] registered to this * coordinator will call `postShuffleRDD` to get its corresponding post-shuffle * [[ShuffledRowRDD]]. - * If this coordinator has made the decision on how to shuffle data, this [[ShuffleExchange]] + * If this coordinator has made the decision on how to shuffle data, this [[ShuffleExchangeExec]] * will immediately get its corresponding post-shuffle [[ShuffledRowRDD]]. * - If this coordinator has not made the decision on how to shuffle data, it will ask those - * registered [[ShuffleExchange]]s to submit their pre-shuffle stages. Then, based on the + * registered [[ShuffleExchangeExec]]s to submit their pre-shuffle stages. 
Then, based on the * size statistics of pre-shuffle partitions, this coordinator will determine the number of * post-shuffle partitions and pack multiple pre-shuffle partitions with continuous indices * to a single post-shuffle partition whenever necessary. * - Finally, this coordinator will create post-shuffle [[ShuffledRowRDD]]s for all registered - * [[ShuffleExchange]]s. So, when a [[ShuffleExchange]] calls `postShuffleRDD`, this coordinator - * can lookup the corresponding [[RDD]]. + * [[ShuffleExchangeExec]]s. So, when a [[ShuffleExchangeExec]] calls `postShuffleRDD`, this + * coordinator can lookup the corresponding [[RDD]]. * * The strategy used to determine the number of post-shuffle partitions is described as follows. * To determine the number of post-shuffle partitions, we have a target input size for a * post-shuffle partition. Once we have size statistics of pre-shuffle partitions from stages - * corresponding to the registered [[ShuffleExchange]]s, we will do a pass of those statistics and - * pack pre-shuffle partitions with continuous indices to a single post-shuffle partition until + * corresponding to the registered [[ShuffleExchangeExec]]s, we will do a pass of those statistics + * and pack pre-shuffle partitions with continuous indices to a single post-shuffle partition until * adding another pre-shuffle partition would cause the size of a post-shuffle partition to be * greater than the target size. * @@ -89,11 +89,11 @@ class ExchangeCoordinator( extends Logging { // The registered Exchange operators. - private[this] val exchanges = ArrayBuffer[ShuffleExchange]() + private[this] val exchanges = ArrayBuffer[ShuffleExchangeExec]() // This map is used to lookup the post-shuffle ShuffledRowRDD for an Exchange operator. - private[this] val postShuffleRDDs: JMap[ShuffleExchange, ShuffledRowRDD] = - new JHashMap[ShuffleExchange, ShuffledRowRDD](numExchanges) + private[this] val postShuffleRDDs: JMap[ShuffleExchangeExec, ShuffledRowRDD] = + new JHashMap[ShuffleExchangeExec, ShuffledRowRDD](numExchanges) // A boolean that indicates if this coordinator has made decision on how to shuffle data. // This variable will only be updated by doEstimationIfNecessary, which is protected by @@ -101,11 +101,11 @@ class ExchangeCoordinator( @volatile private[this] var estimated: Boolean = false /** - * Registers a [[ShuffleExchange]] operator to this coordinator. This method is only allowed to - * be called in the `doPrepare` method of a [[ShuffleExchange]] operator. + * Registers a [[ShuffleExchangeExec]] operator to this coordinator. This method is only allowed + * to be called in the `doPrepare` method of a [[ShuffleExchangeExec]] operator. */ @GuardedBy("this") - def registerExchange(exchange: ShuffleExchange): Unit = synchronized { + def registerExchange(exchange: ShuffleExchangeExec): Unit = synchronized { exchanges += exchange } @@ -200,7 +200,7 @@ class ExchangeCoordinator( // Make sure we have the expected number of registered Exchange operators. 
assert(exchanges.length == numExchanges) - val newPostShuffleRDDs = new JHashMap[ShuffleExchange, ShuffledRowRDD](numExchanges) + val newPostShuffleRDDs = new JHashMap[ShuffleExchangeExec, ShuffledRowRDD](numExchanges) // Submit all map stages val shuffleDependencies = ArrayBuffer[ShuffleDependency[Int, InternalRow, InternalRow]]() @@ -255,7 +255,7 @@ class ExchangeCoordinator( } } - def postShuffleRDD(exchange: ShuffleExchange): ShuffledRowRDD = { + def postShuffleRDD(exchange: ShuffleExchangeExec): ShuffledRowRDD = { doEstimationIfNecessary() if (!postShuffleRDDs.containsKey(exchange)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 0d06d83fb2f3c..11c4aa9b4acf0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -35,7 +35,7 @@ import org.apache.spark.util.MutablePair /** * Performs a shuffle that will result in the desired `newPartitioning`. */ -case class ShuffleExchange( +case class ShuffleExchangeExec( var newPartitioning: Partitioning, child: SparkPlan, @transient coordinator: Option[ExchangeCoordinator]) extends Exchange { @@ -84,7 +84,7 @@ case class ShuffleExchange( */ private[exchange] def prepareShuffleDependency() : ShuffleDependency[Int, InternalRow, InternalRow] = { - ShuffleExchange.prepareShuffleDependency( + ShuffleExchangeExec.prepareShuffleDependency( child.execute(), child.output, newPartitioning, serializer) } @@ -129,9 +129,9 @@ case class ShuffleExchange( } } -object ShuffleExchange { - def apply(newPartitioning: Partitioning, child: SparkPlan): ShuffleExchange = { - ShuffleExchange(newPartitioning, child, coordinator = Option.empty[ExchangeCoordinator]) +object ShuffleExchangeExec { + def apply(newPartitioning: Partitioning, child: SparkPlan): ShuffleExchangeExec = { + ShuffleExchangeExec(newPartitioning, child, coordinator = Option.empty[ExchangeCoordinator]) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index 1f515e29b4af5..13da4b26a5dcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, LazilyGeneratedOrdering} import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.exchange.ShuffleExchange +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.util.Utils /** @@ -40,7 +40,7 @@ case class CollectLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode protected override def doExecute(): RDD[InternalRow] = { val locallyLimited = child.execute().mapPartitionsInternal(_.take(limit)) val shuffled = new ShuffledRowRDD( - ShuffleExchange.prepareShuffleDependency( + ShuffleExchangeExec.prepareShuffleDependency( locallyLimited, child.output, SinglePartition, serializer)) 
shuffled.mapPartitionsInternal(_.take(limit)) } @@ -153,7 +153,7 @@ case class TakeOrderedAndProjectExec( } } val shuffled = new ShuffledRowRDD( - ShuffleExchange.prepareShuffleDependency( + ShuffleExchangeExec.prepareShuffleDependency( localTopK, child.output, SinglePartition, serializer)) shuffled.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 8e0aae39cabb6..82f879c763c2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, HashPartitioning, SinglePartition} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.ShuffleExchange +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.streaming.OutputMode /** @@ -155,7 +155,7 @@ object EnsureStatefulOpPartitioning extends Rule[SparkPlan] { child.execute().getNumPartitions == expectedPartitioning.numPartitions) { child } else { - ShuffleExchange(expectedPartitioning, child) + ShuffleExchangeExec(expectedPartitioning, child) } } so.withNewChildren(children) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 3e4f619431599..1e52445f28fc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.execution.{RDDScanExec, SparkPlan} import org.apache.spark.sql.execution.columnar._ -import org.apache.spark.sql.execution.exchange.ShuffleExchange +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.storage.{RDDBlockId, StorageLevel} @@ -420,7 +420,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext * Verifies that the plan for `df` contains `expected` number of Exchange operators. 
*/ private def verifyNumExchanges(df: DataFrame, expected: Int): Unit = { - assert(df.queryExecution.executedPlan.collect { case e: ShuffleExchange => e }.size == expected) + assert( + df.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => e }.size == expected) } test("A cached table preserves the partitioning and ordering of its cached SparkPlan") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6178661cf7b2b..0e2f2e5a193e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.{Filter, OneRowRelation, Union} import org.apache.spark.sql.execution.{FilterExec, QueryExecution} import org.apache.spark.sql.execution.aggregate.HashAggregateExec -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSQLContext} @@ -1529,7 +1529,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { fail("Should not have back to back Aggregates") } atFirstAgg = true - case e: ShuffleExchange => atFirstAgg = false + case e: ShuffleExchangeExec => atFirstAgg = false case _ => } } @@ -1710,19 +1710,19 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val plan = join.queryExecution.executedPlan checkAnswer(join, df) assert( - join.queryExecution.executedPlan.collect { case e: ShuffleExchange => true }.size === 1) + join.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => true }.size === 1) assert( join.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size === 1) val broadcasted = broadcast(join) val join2 = join.join(broadcasted, "id").join(broadcasted, "id") checkAnswer(join2, df) assert( - join2.queryExecution.executedPlan.collect { case e: ShuffleExchange => true }.size === 1) + join2.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) assert( join2.queryExecution.executedPlan .collect { case e: BroadcastExchangeExec => true }.size === 1) assert( - join2.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size === 4) + join2.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size == 4) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 5015f3709f131..dace6825ee40e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder} import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi} import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.{LogicalRDD, RDDScanExec} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} import 
org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -1206,7 +1206,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val agg = cp.groupBy('id % 2).agg(count('id)) agg.queryExecution.executedPlan.collectFirst { - case ShuffleExchange(_, _: RDDScanExec, _) => + case ShuffleExchangeExec(_, _: RDDScanExec, _) => case BroadcastExchangeExec(_, _: RDDScanExec) => }.foreach { _ => fail( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala index f1b5e3be5b63f..737eeb0af586e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala @@ -21,7 +21,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.{MapOutputStatistics, SparkConf, SparkFunSuite} import org.apache.spark.sql._ -import org.apache.spark.sql.execution.exchange.{ExchangeCoordinator, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{ExchangeCoordinator, ShuffleExchangeExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -300,13 +300,13 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { // Then, let's look at the number of post-shuffle partitions estimated // by the ExchangeCoordinator. val exchanges = agg.queryExecution.executedPlan.collect { - case e: ShuffleExchange => e + case e: ShuffleExchangeExec => e } assert(exchanges.length === 1) minNumPostShufflePartitions match { case Some(numPartitions) => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 5) case o => @@ -314,7 +314,7 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { case None => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 3) case o => @@ -351,13 +351,13 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { // Then, let's look at the number of post-shuffle partitions estimated // by the ExchangeCoordinator. val exchanges = join.queryExecution.executedPlan.collect { - case e: ShuffleExchange => e + case e: ShuffleExchangeExec => e } assert(exchanges.length === 2) minNumPostShufflePartitions match { case Some(numPartitions) => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 5) case o => @@ -365,7 +365,7 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { case None => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 2) case o => @@ -407,13 +407,13 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { // Then, let's look at the number of post-shuffle partitions estimated // by the ExchangeCoordinator. 
val exchanges = join.queryExecution.executedPlan.collect { - case e: ShuffleExchange => e + case e: ShuffleExchangeExec => e } assert(exchanges.length === 4) minNumPostShufflePartitions match { case Some(numPartitions) => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 5) case o => @@ -459,13 +459,13 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { // Then, let's look at the number of post-shuffle partitions estimated // by the ExchangeCoordinator. val exchanges = join.queryExecution.executedPlan.collect { - case e: ShuffleExchange => e + case e: ShuffleExchangeExec => e } assert(exchanges.length === 3) minNumPostShufflePartitions match { case Some(numPartitions) => exchanges.foreach { - case e: ShuffleExchange => + case e: ShuffleExchangeExec => assert(e.coordinator.isDefined) assert(e.outputPartitioning.numPartitions === 5) case o => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala index 59eaf4d1c29b7..aac8d56ba6201 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, IdentityBroadcastMode, SinglePartition} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode import org.apache.spark.sql.test.SharedSQLContext @@ -31,7 +31,7 @@ class ExchangeSuite extends SparkPlanTest with SharedSQLContext { val input = (1 to 1000).map(Tuple1.apply) checkAnswer( input.toDF(), - plan => ShuffleExchange(SinglePartition, plan), + plan => ShuffleExchangeExec(SinglePartition, plan), input.map(Row.fromTuple) ) } @@ -81,12 +81,12 @@ class ExchangeSuite extends SparkPlanTest with SharedSQLContext { assert(plan sameResult plan) val part1 = HashPartitioning(output, 1) - val exchange1 = ShuffleExchange(part1, plan) - val exchange2 = ShuffleExchange(part1, plan) + val exchange1 = ShuffleExchangeExec(part1, plan) + val exchange2 = ShuffleExchangeExec(part1, plan) val part2 = HashPartitioning(output, 2) - val exchange3 = ShuffleExchange(part2, plan) + val exchange3 = ShuffleExchangeExec(part2, plan) val part3 = HashPartitioning(output ++ output, 2) - val exchange4 = ShuffleExchange(part3, plan) + val exchange4 = ShuffleExchangeExec(part3, plan) val exchange5 = ReusedExchangeExec(output, exchange4) assert(exchange1 sameResult exchange1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 63e17c7f372b0..86066362da9dd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, LeftOuter, import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Repartition} import 
org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.columnar.InMemoryRelation -import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ReuseExchange, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ReuseExchange, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -214,7 +214,7 @@ class PlannerSuite extends SharedSQLContext { | JOIN tiny ON (small.key = tiny.key) """.stripMargin ).queryExecution.executedPlan.collect { - case exchange: ShuffleExchange => exchange + case exchange: ShuffleExchangeExec => exchange }.length assert(numExchanges === 5) } @@ -229,7 +229,7 @@ class PlannerSuite extends SharedSQLContext { | JOIN tiny ON (normal.key = tiny.key) """.stripMargin ).queryExecution.executedPlan.collect { - case exchange: ShuffleExchange => exchange + case exchange: ShuffleExchangeExec => exchange }.length assert(numExchanges === 5) } @@ -300,7 +300,7 @@ class PlannerSuite extends SharedSQLContext { ) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.isEmpty) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.isEmpty) { fail(s"Exchange should have been added:\n$outputPlan") } } @@ -338,7 +338,7 @@ class PlannerSuite extends SharedSQLContext { ) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.isEmpty) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.isEmpty) { fail(s"Exchange should have been added:\n$outputPlan") } } @@ -358,7 +358,7 @@ class PlannerSuite extends SharedSQLContext { ) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.nonEmpty) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.nonEmpty) { fail(s"Exchange should not have been added:\n$outputPlan") } } @@ -381,7 +381,7 @@ class PlannerSuite extends SharedSQLContext { ) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.nonEmpty) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.nonEmpty) { fail(s"No Exchanges should have been added:\n$outputPlan") } } @@ -391,7 +391,7 @@ class PlannerSuite extends SharedSQLContext { val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchange(finalPartitioning, + val inputPlan = ShuffleExchangeExec(finalPartitioning, DummySparkPlan( children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, requiredChildDistribution = Seq(distribution), @@ -400,7 +400,7 @@ class PlannerSuite extends SharedSQLContext { val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.size == 2) { + if (outputPlan.collect { case 
e: ShuffleExchangeExec => true }.size == 2) { fail(s"Topmost Exchange should have been eliminated:\n$outputPlan") } } @@ -411,7 +411,7 @@ class PlannerSuite extends SharedSQLContext { val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 8) val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchange(finalPartitioning, + val inputPlan = ShuffleExchangeExec(finalPartitioning, DummySparkPlan( children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, requiredChildDistribution = Seq(distribution), @@ -420,7 +420,7 @@ class PlannerSuite extends SharedSQLContext { val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchange => true }.size == 1) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) { fail(s"Topmost Exchange should not have been eliminated:\n$outputPlan") } } @@ -430,7 +430,7 @@ class PlannerSuite extends SharedSQLContext { val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) assert(!childPartitioning.satisfies(distribution)) - val shuffle = ShuffleExchange(finalPartitioning, + val shuffle = ShuffleExchangeExec(finalPartitioning, DummySparkPlan( children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, requiredChildDistribution = Seq(distribution), @@ -449,7 +449,7 @@ class PlannerSuite extends SharedSQLContext { if (outputPlan.collect { case e: ReusedExchangeExec => true }.size != 1) { fail(s"Should re-use the shuffle:\n$outputPlan") } - if (outputPlan.collect { case e: ShuffleExchange => true }.size != 1) { + if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size != 1) { fail(s"Should have only one shuffle:\n$outputPlan") } @@ -459,14 +459,14 @@ class PlannerSuite extends SharedSQLContext { Literal(1) :: Nil, Inner, None, - ShuffleExchange(finalPartitioning, inputPlan), - ShuffleExchange(finalPartitioning, inputPlan)) + ShuffleExchangeExec(finalPartitioning, inputPlan), + ShuffleExchangeExec(finalPartitioning, inputPlan)) val outputPlan2 = ReuseExchange(spark.sessionState.conf).apply(inputPlan2) if (outputPlan2.collect { case e: ReusedExchangeExec => true }.size != 2) { fail(s"Should re-use the two shuffles:\n$outputPlan2") } - if (outputPlan2.collect { case e: ShuffleExchange => true }.size != 2) { + if (outputPlan2.collect { case e: ShuffleExchangeExec => true }.size != 2) { fail(s"Should have only two shuffles:\n$outputPlan") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index eb9e6458fc61c..ab18905e2ddb2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.{DataSourceScanExec, SortExec} import org.apache.spark.sql.execution.datasources.DataSourceStrategy -import org.apache.spark.sql.execution.exchange.ShuffleExchange +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.functions._ import 
org.apache.spark.sql.internal.SQLConf @@ -302,10 +302,10 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { // check existence of shuffle assert( - joinOperator.left.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleLeft, + joinOperator.left.find(_.isInstanceOf[ShuffleExchangeExec]).isDefined == shuffleLeft, s"expected shuffle in plan to be $shuffleLeft but found\n${joinOperator.left}") assert( - joinOperator.right.find(_.isInstanceOf[ShuffleExchange]).isDefined == shuffleRight, + joinOperator.right.find(_.isInstanceOf[ShuffleExchangeExec]).isDefined == shuffleRight, s"expected shuffle in plan to be $shuffleRight but found\n${joinOperator.right}") // check existence of sort @@ -506,7 +506,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { agged.sort("i", "j"), df1.groupBy("i", "j").agg(max("k")).sort("i", "j")) - assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchange]).isEmpty) + assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) } } @@ -520,7 +520,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { agged.sort("i", "j"), df1.groupBy("i", "j").agg(max("k")).sort("i", "j")) - assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchange]).isEmpty) + assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala index 044bb03480aa4..ed9823fbddfda 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.{Exchange, ShuffleExchange} +import org.apache.spark.sql.execution.exchange.{Exchange, ShuffleExchangeExec} import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata, StatefulOperator, StatefulOperatorStateInfo} import org.apache.spark.sql.test.SharedSQLContext @@ -93,7 +93,7 @@ class EnsureStatefulOpPartitioningSuite extends SparkPlanTest with SharedSQLCont fail(s"Was expecting an exchange but didn't get one in:\n$executed") } assert(exchange.get === - ShuffleExchange(expectedPartitioning(inputPlan.output.take(1)), inputPlan), + ShuffleExchangeExec(expectedPartitioning(inputPlan.output.take(1)), inputPlan), s"Exchange didn't have expected properties:\n${exchange.get}") } else { assert(!executed.children.exists(_.isInstanceOf[Exchange]), From d29d1e87995e02cb57ba3026c945c3cd66bb06e2 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 28 Sep 2017 15:59:05 -0700 Subject: [PATCH 640/779] [SPARK-22159][SQL] Make config names consistently end with "enabled". ## What changes were proposed in this pull request? spark.sql.execution.arrow.enable and spark.sql.codegen.aggregate.map.twolevel.enable -> enabled ## How was this patch tested? N/A Author: Reynold Xin Closes #19384 from rxin/SPARK-22159. 
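For illustration only (not part of this patch), a minimal sketch of how the renamed keys would be set from user code; it assumes nothing beyond a local `SparkSession`, and the old `*.enable` spellings appear only in comments for comparison:

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: set the renamed configuration keys at runtime.
// Assumes only a local SparkSession; the values used here are arbitrary.
val spark = SparkSession.builder()
  .appName("config-rename-sketch")
  .master("local[*]")
  .getOrCreate()

// Formerly "spark.sql.execution.arrow.enable"
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

// Formerly "spark.sql.codegen.aggregate.map.twolevel.enable"
spark.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true")
```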
--- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index d00c672487532..358cf62149070 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -668,7 +668,7 @@ object SQLConf { .createWithDefault(40) val ENABLE_TWOLEVEL_AGG_MAP = - buildConf("spark.sql.codegen.aggregate.map.twolevel.enable") + buildConf("spark.sql.codegen.aggregate.map.twolevel.enabled") .internal() .doc("Enable two-level aggregate hash map. When enabled, records will first be " + "inserted/looked-up at a 1st-level, small, fast map, and then fallback to a " + @@ -908,7 +908,7 @@ object SQLConf { .createWithDefault(false) val ARROW_EXECUTION_ENABLE = - buildConf("spark.sql.execution.arrow.enable") + buildConf("spark.sql.execution.arrow.enabled") .internal() .doc("Make use of Apache Arrow for columnar data transfers. Currently available " + "for use with pyspark.sql.DataFrame.toPandas with the following data types: " + From 323806e68f91f3c7521327186a37ddd1436267d0 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 28 Sep 2017 21:07:12 -0700 Subject: [PATCH 641/779] [SPARK-22160][SQL] Make sample points per partition (in range partitioner) configurable and bump the default value up to 100 ## What changes were proposed in this pull request? Spark's RangePartitioner hard codes the number of sampling points per partition to be 20. This is sometimes too low. This ticket makes it configurable, via spark.sql.execution.rangeExchange.sampleSizePerPartition, and raises the default in Spark SQL to be 100. ## How was this patch tested? Added a pretty sophisticated test based on chi square test ... Author: Reynold Xin Closes #19387 from rxin/SPARK-22160. --- .../scala/org/apache/spark/Partitioner.scala | 15 ++++- .../apache/spark/sql/internal/SQLConf.scala | 10 +++ .../exchange/ShuffleExchangeExec.scala | 7 +- .../spark/sql/ConfigBehaviorSuite.scala | 66 +++++++++++++++++++ 4 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 1484f29525a4e..debbd8d7c26c9 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -108,11 +108,21 @@ class HashPartitioner(partitions: Int) extends Partitioner { class RangePartitioner[K : Ordering : ClassTag, V]( partitions: Int, rdd: RDD[_ <: Product2[K, V]], - private var ascending: Boolean = true) + private var ascending: Boolean = true, + val samplePointsPerPartitionHint: Int = 20) extends Partitioner { + // A constructor declared in order to maintain backward compatibility for Java, when we add the + // 4th constructor parameter samplePointsPerPartitionHint. See SPARK-22160. + // This is added to make sure from a bytecode point of view, there is still a 3-arg ctor. + def this(partitions: Int, rdd: RDD[_ <: Product2[K, V]], ascending: Boolean) = { + this(partitions, rdd, ascending, samplePointsPerPartitionHint = 20) + } + // We allow partitions = 0, which happens when sorting an empty RDD under the default settings. 
require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions.") + require(samplePointsPerPartitionHint > 0, + s"Sample points per partition must be greater than 0 but found $samplePointsPerPartitionHint") private var ordering = implicitly[Ordering[K]] @@ -122,7 +132,8 @@ class RangePartitioner[K : Ordering : ClassTag, V]( Array.empty } else { // This is the sample size we need to have roughly balanced output partitions, capped at 1M. - val sampleSize = math.min(20.0 * partitions, 1e6) + // Cast to double to avoid overflowing ints or longs + val sampleSize = math.min(samplePointsPerPartitionHint.toDouble * partitions, 1e6) // Assume the input partitions are roughly balanced and over-sample a little bit. val sampleSizePerPartition = math.ceil(3.0 * sampleSize / rdd.partitions.length).toInt val (numItems, sketched) = RangePartitioner.sketch(rdd.map(_._1), sampleSizePerPartition) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 358cf62149070..1a73d168b9b6e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -907,6 +907,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION = + buildConf("spark.sql.execution.rangeExchange.sampleSizePerPartition") + .internal() + .doc("Number of points to sample per partition in order to determine the range boundaries" + + " for range partitioning, typically used in global sorting (without limit).") + .intConf + .createWithDefault(100) + val ARROW_EXECUTION_ENABLE = buildConf("spark.sql.execution.arrow.enabled") .internal() @@ -1199,6 +1207,8 @@ class SQLConf extends Serializable with Logging { def supportQuotedRegexColumnName: Boolean = getConf(SUPPORT_QUOTED_REGEX_COLUMN_NAME) + def rangeExchangeSampleSizePerPartition: Int = getConf(RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION) + def arrowEnable: Boolean = getConf(ARROW_EXECUTION_ENABLE) def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 11c4aa9b4acf0..5a1e217082bc2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.MutablePair /** @@ -218,7 +219,11 @@ object ShuffleExchangeExec { iter.map(row => mutablePair.update(row.copy(), null)) } implicit val ordering = new LazilyGeneratedOrdering(sortingExpressions, outputAttributes) - new RangePartitioner(numPartitions, rddForSampling, ascending = true) + new RangePartitioner( + numPartitions, + rddForSampling, + ascending = true, + samplePointsPerPartitionHint = SQLConf.get.rangeExchangeSampleSizePerPartition) case SinglePartition => new Partitioner { override def numPartitions: Int = 1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala new file mode 100644 index 0000000000000..2c1e5db5fd9bb --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.commons.math3.stat.inference.ChiSquareTest + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + + +class ConfigBehaviorSuite extends QueryTest with SharedSQLContext { + + import testImplicits._ + + test("SPARK-22160 spark.sql.execution.rangeExchange.sampleSizePerPartition") { + // In this test, we run a sort and compute the histogram for partition size post shuffle. + // With a high sample count, the partition size should be more evenly distributed, and has a + // low chi-sq test value. + // Also the whole code path for range partitioning as implemented should be deterministic + // (it uses the partition id as the seed), so this test shouldn't be flaky. + + val numPartitions = 4 + + def computeChiSquareTest(): Double = { + val n = 10000 + // Trigger a sort + val data = spark.range(0, n, 1, 1).sort('id) + .selectExpr("SPARK_PARTITION_ID() pid", "id").as[(Int, Long)].collect() + + // Compute histogram for the number of records per partition post sort + val dist = data.groupBy(_._1).map(_._2.length.toLong).toArray + assert(dist.length == 4) + + new ChiSquareTest().chiSquare( + Array.fill(numPartitions) { n.toDouble / numPartitions }, + dist) + } + + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString) { + // The default chi-sq value should be low + assert(computeChiSquareTest() < 100) + + withSQLConf(SQLConf.RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION.key -> "1") { + // If we only sample one point, the range boundaries will be pretty bad and the + // chi-sq value would be very high. + assert(computeChiSquareTest() > 1000) + } + } + } + +} From 161ba7eaa4539f0a7f20d9e2a493e0e323ca5249 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 28 Sep 2017 23:14:53 -0700 Subject: [PATCH 642/779] [SPARK-22146] FileNotFoundException while reading ORC files containing special characters ## What changes were proposed in this pull request? Reading ORC files containing special characters like '%' fails with a FileNotFoundException. This PR aims to fix the problem. ## How was this patch tested? Added UT. Author: Marco Gaido Author: Marco Gaido Closes #19368 from mgaido91/SPARK-22146. 
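For illustration only (not part of this patch), a minimal standalone sketch mirroring the regression test added below: write a dataset to a directory whose name contains special characters such as '%' and read it back through the ORC source. It assumes a local `SparkSession` built with Hive support (the fixed code path is the Hive-based ORC reader) and a writable `/tmp`; per the description above, before the fix the schema inference passed the URI form of each file path (`getPath.toUri.toString`) to the reader and the read failed with a `FileNotFoundException`.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch of the round-trip exercised by the new test; the path name is arbitrary.
val spark = SparkSession.builder()
  .appName("special-chars-roundtrip-sketch")
  .master("local[*]")
  .enableHiveSupport() // the affected reader is the Hive-based ORC file format
  .getOrCreate()
import spark.implicits._

val path = s"/tmp/sp&cial%chars-${System.currentTimeMillis()}"
spark.createDataset(Seq("a", "b")).write.format("orc").save(path)

// With the fix, schema inference uses getPath.toString and this read succeeds.
spark.read.format("orc").load(path).show()
```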
--- .../apache/spark/sql/hive/orc/OrcFileFormat.scala | 2 +- .../spark/sql/hive/MetastoreDataSourcesSuite.scala | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 4d92a67044373..c76f0ebb36a60 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -58,7 +58,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { OrcFileOperator.readSchema( - files.map(_.getPath.toUri.toString), + files.map(_.getPath.toString), Some(sparkSession.sessionState.newHadoopConf()) ) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 29b0e6c8533ef..f5d41c91270a5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -993,7 +993,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv spark.sql("""drop database if exists testdb8156 CASCADE""") } - test("skip hive metadata on table creation") { withTempDir { tempPath => val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType))) @@ -1345,6 +1344,17 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } + Seq("orc", "parquet", "csv", "json", "text").foreach { format => + test(s"SPARK-22146: read files containing special characters using $format") { + val nameWithSpecialChars = s"sp&cial%chars" + withTempDir { dir => + val tmpFile = s"$dir/$nameWithSpecialChars" + spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) + spark.read.format(format).load(tmpFile) + } + } + } + private def withDebugMode(f: => Unit): Unit = { val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) try { From 0fa4dbe4f4d7b988be2105b46590b5207f7c8121 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 28 Sep 2017 23:23:30 -0700 Subject: [PATCH 643/779] [SPARK-22141][FOLLOWUP][SQL] Add comments for the order of batches ## What changes were proposed in this pull request? Add comments for specifying the position of batch "Check Cartesian Products", as rxin suggested in https://github.com/apache/spark/pull/19362 . ## How was this patch tested? Unit test Author: Wang Gengliang Closes #19379 from gengliangwang/SPARK-22141-followup. --- .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index a391c513ad384..b9fa39d6dad4c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -134,6 +134,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) Batch("LocalRelation", fixedPoint, ConvertToLocalRelation, PropagateEmptyRelation) :: + // The following batch should be executed after batch "Join Reorder" and "LocalRelation". 
Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Batch("OptimizeCodegen", Once, @@ -1089,6 +1090,9 @@ object CombineLimits extends Rule[LogicalPlan] { * SELECT * from R, S where R.r = S.s, * the join between R and S is not a cartesian product and therefore should be allowed. * The predicate R.r = S.s is not recognized as a join condition until the ReorderJoin rule. + * + * This rule must be run AFTER the batch "LocalRelation", since a join with empty relation should + * not be a cartesian product. */ object CheckCartesianProducts extends Rule[LogicalPlan] with PredicateHelper { /** From a2516f41aef68e39df7f6380fd2618cc148a609e Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 29 Sep 2017 08:26:53 +0100 Subject: [PATCH 644/779] [SPARK-22142][BUILD][STREAMING] Move Flume support behind a profile ## What changes were proposed in this pull request? Add 'flume' profile to enable Flume-related integration modules ## How was this patch tested? Existing tests; no functional change Author: Sean Owen Closes #19365 from srowen/SPARK-22142. --- dev/create-release/release-build.sh | 4 ++-- dev/mima | 2 +- dev/scalastyle | 1 + dev/sparktestsupport/modules.py | 20 +++++++++++++++++++- dev/test-dependencies.sh | 2 +- docs/building-spark.md | 6 ++++++ pom.xml | 13 ++++++++++--- project/SparkBuild.scala | 17 +++++++++-------- python/pyspark/streaming/tests.py | 16 +++++++++++++--- 9 files changed, 62 insertions(+), 19 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 8de1d6a37dc25..c548a0a4e4bee 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -84,9 +84,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/mima b/dev/mima index fdb21f5007cf2..1e3ca9700bc07 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index e5aa589869535..89ecc8abd6f8c 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -25,6 +25,7 @@ ERRORS=$(echo -e "q\n" \ -Pmesos \ -Pkafka-0-8 \ -Pyarn \ + -Pflume \ -Phive \ -Phive-thriftserver \ scalastyle test:scalastyle \ diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 50e14b60545af..91d5667ed1f07 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -279,6 +279,12 @@ def __hash__(self): source_file_regexes=[ "external/flume-sink", ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + 
"ENABLE_FLUME_TESTS": "1" + }, sbt_test_goals=[ "streaming-flume-sink/test", ] @@ -291,6 +297,12 @@ def __hash__(self): source_file_regexes=[ "external/flume", ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + "ENABLE_FLUME_TESTS": "1" + }, sbt_test_goals=[ "streaming-flume/test", ] @@ -302,7 +314,13 @@ def __hash__(self): dependencies=[streaming_flume, streaming_flume_sink], source_file_regexes=[ "external/flume-assembly", - ] + ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + "ENABLE_FLUME_TESTS": "1" + } ) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index c7714578bd005..58b295d4f6e00 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 diff --git a/docs/building-spark.md b/docs/building-spark.md index 57baa503259c1..e1532de16108d 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -100,6 +100,12 @@ Note: Kafka 0.8 support is deprecated as of Spark 2.3.0. Kafka 0.10 support is still automatically built. +## Building with Flume support + +Apache Flume support must be explicitly enabled with the `flume` profile. + + ./build/mvn -Pflume -DskipTests clean package + ## Building submodules individually It's possible to build Spark sub-modules using the `mvn -pl` option. diff --git a/pom.xml b/pom.xml index 87a468c3a6f55..9fac8b1e53788 100644 --- a/pom.xml +++ b/pom.xml @@ -98,15 +98,13 @@ sql/core sql/hive assembly - external/flume - external/flume-sink - external/flume-assembly examples repl launcher external/kafka-0-10 external/kafka-0-10-assembly external/kafka-0-10-sql + @@ -2583,6 +2581,15 @@ + + flume + + external/flume + external/flume-sink + external/flume-assembly + + + spark-ganglia-lgpl diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index a568d264cb2db..9501eed1e906b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -43,11 +43,8 @@ object BuildCommons { "catalyst", "sql", "hive", "hive-thriftserver", "sql-kafka-0-10" ).map(ProjectRef(buildLocation, _)) - val streamingProjects@Seq( - streaming, streamingFlumeSink, streamingFlume, streamingKafka010 - ) = Seq( - "streaming", "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-10" - ).map(ProjectRef(buildLocation, _)) + val streamingProjects@Seq(streaming, streamingKafka010) = + Seq("streaming", "streaming-kafka-0-10").map(ProjectRef(buildLocation, _)) val allProjects@Seq( core, graphx, mllib, mllibLocal, repl, networkCommon, networkShuffle, launcher, unsafe, tags, sketch, kvstore, _* @@ -56,9 +53,13 @@ object BuildCommons { "tags", "sketch", "kvstore" ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects - val optionallyEnabledProjects@Seq(mesos, yarn, streamingKafka, sparkGangliaLgpl, - streamingKinesisAsl, dockerIntegrationTests, hadoopCloud) = - Seq("mesos", "yarn", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", + val optionallyEnabledProjects@Seq(mesos, yarn, + streamingFlumeSink, streamingFlume, + streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, + dockerIntegrationTests, hadoopCloud) = + Seq("mesos", "yarn", + "streaming-flume-sink", "streaming-flume", + 
"streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKafka010Assembly, streamingKinesisAslAssembly) = diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 229cf53e47359..5b86c1cb2c390 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -1478,7 +1478,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/package streaming-kafka-0-8-assembly/assembly' or " - "'build/mvn package' before running this test.") + "'build/mvn -Pkafka-0-8 package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1495,7 +1495,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " - "'build/mvn package' before running this test.") + "'build/mvn -Pflume package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1516,6 +1516,9 @@ def search_kinesis_asl_assembly_jar(): return jars[0] +# Must be same as the variable and condition defined in modules.py +flume_test_environ_var = "ENABLE_FLUME_TESTS" +are_flume_tests_enabled = os.environ.get(flume_test_environ_var) == '1' # Must be same as the variable and condition defined in modules.py kafka_test_environ_var = "ENABLE_KAFKA_0_8_TESTS" are_kafka_tests_enabled = os.environ.get(kafka_test_environ_var) == '1' @@ -1538,9 +1541,16 @@ def search_kinesis_asl_assembly_jar(): os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, - FlumeStreamTests, FlumePollingStreamTests, StreamingListenerTests] + if are_flume_tests_enabled: + testcases.append(FlumeStreamTests) + testcases.append(FlumePollingStreamTests) + else: + sys.stderr.write( + "Skipped test_flume_stream (enable by setting environment variable %s=1" + % flume_test_environ_var) + if are_kafka_tests_enabled: testcases.append(KafkaStreamTests) else: From ecbe416ab5001b32737966c5a2407597a1dafc32 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 29 Sep 2017 08:04:14 -0700 Subject: [PATCH 645/779] [SPARK-22129][SPARK-22138] Release script improvements ## What changes were proposed in this pull request? Use the GPG_KEY param, fix lsof to non-hardcoded path, remove version swap since it wasn't really needed. Use EXPORT on JAVA_HOME for downstream scripts as well. ## How was this patch tested? Rolled 2.1.2 RC2 Author: Holden Karau Closes #19359 from holdenk/SPARK-22129-fix-signing. 
--- dev/create-release/release-build.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c548a0a4e4bee..7e8d5c7075195 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -74,7 +74,7 @@ GIT_REF=${GIT_REF:-master} # Destination directory parent on remote server REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} -GPG="gpg --no-tty --batch" +GPG="gpg -u $GPG_KEY --no-tty --batch" NEXUS_ROOT=https://repository.apache.org/service/local/staging NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads BASE_DIR=$(pwd) @@ -125,7 +125,7 @@ else echo "Please set JAVA_HOME correctly." exit 1 else - JAVA_HOME="$JAVA_7_HOME" + export JAVA_HOME="$JAVA_7_HOME" fi fi fi @@ -140,7 +140,7 @@ DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" function LFTP { SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" COMMANDS=$(cat < Date: Fri, 29 Sep 2017 08:59:42 -0700 Subject: [PATCH 646/779] [SPARK-22161][SQL] Add Impala-modified TPC-DS queries ## What changes were proposed in this pull request? Added IMPALA-modified TPCDS queries to TPC-DS query suites. - Ref: https://github.com/cloudera/impala-tpcds-kit/tree/master/queries ## How was this patch tested? N/A Author: gatorsmile Closes #19386 from gatorsmile/addImpalaQueries. --- .../resources/tpcds-modifiedQueries/q10.sql | 70 ++++++ .../resources/tpcds-modifiedQueries/q19.sql | 38 +++ .../resources/tpcds-modifiedQueries/q27.sql | 43 ++++ .../resources/tpcds-modifiedQueries/q3.sql | 228 ++++++++++++++++++ .../resources/tpcds-modifiedQueries/q34.sql | 45 ++++ .../resources/tpcds-modifiedQueries/q42.sql | 28 +++ .../resources/tpcds-modifiedQueries/q43.sql | 36 +++ .../resources/tpcds-modifiedQueries/q46.sql | 80 ++++++ .../resources/tpcds-modifiedQueries/q52.sql | 27 +++ .../resources/tpcds-modifiedQueries/q53.sql | 37 +++ .../resources/tpcds-modifiedQueries/q55.sql | 24 ++ .../resources/tpcds-modifiedQueries/q59.sql | 83 +++++++ .../resources/tpcds-modifiedQueries/q63.sql | 29 +++ .../resources/tpcds-modifiedQueries/q65.sql | 58 +++++ .../resources/tpcds-modifiedQueries/q68.sql | 62 +++++ .../resources/tpcds-modifiedQueries/q7.sql | 31 +++ .../resources/tpcds-modifiedQueries/q73.sql | 49 ++++ .../resources/tpcds-modifiedQueries/q79.sql | 59 +++++ .../resources/tpcds-modifiedQueries/q89.sql | 43 ++++ .../resources/tpcds-modifiedQueries/q98.sql | 32 +++ .../tpcds-modifiedQueries/ss_max.sql | 14 ++ .../apache/spark/sql/TPCDSQuerySuite.scala | 26 +- 22 files changed, 1141 insertions(+), 1 deletion(-) create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q10.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q19.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q27.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q3.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q34.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q42.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q43.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q46.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q52.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q53.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q55.sql create mode 100755 
sql/core/src/test/resources/tpcds-modifiedQueries/q59.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q63.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q65.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q68.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q7.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q73.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q79.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q89.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/q98.sql create mode 100755 sql/core/src/test/resources/tpcds-modifiedQueries/ss_max.sql diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q10.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q10.sql new file mode 100755 index 0000000000000..79dd3d516e8c7 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q10.sql @@ -0,0 +1,70 @@ +-- start query 10 in stream 0 using template query10.tpl +with +v1 as ( + select + ws_bill_customer_sk as customer_sk + from web_sales, + date_dim + where ws_sold_date_sk = d_date_sk + and d_year = 2002 + and d_moy between 4 and 4+3 + union all + select + cs_ship_customer_sk as customer_sk + from catalog_sales, + date_dim + where cs_sold_date_sk = d_date_sk + and d_year = 2002 + and d_moy between 4 and 4+3 +), +v2 as ( + select + ss_customer_sk as customer_sk + from store_sales, + date_dim + where ss_sold_date_sk = d_date_sk + and d_year = 2002 + and d_moy between 4 and 4+3 +) +select + cd_gender, + cd_marital_status, + cd_education_status, + count(*) cnt1, + cd_purchase_estimate, + count(*) cnt2, + cd_credit_rating, + count(*) cnt3, + cd_dep_count, + count(*) cnt4, + cd_dep_employed_count, + count(*) cnt5, + cd_dep_college_count, + count(*) cnt6 +from customer c +join customer_address ca on (c.c_current_addr_sk = ca.ca_address_sk) +join customer_demographics on (cd_demo_sk = c.c_current_cdemo_sk) +left semi join v1 on (v1.customer_sk = c.c_customer_sk) +left semi join v2 on (v2.customer_sk = c.c_customer_sk) +where + ca_county in ('Walker County','Richland County','Gaines County','Douglas County','Dona Ana County') +group by + cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count +order by + cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count +limit 100 +-- end query 10 in stream 0 using template query10.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q19.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q19.sql new file mode 100755 index 0000000000000..1799827762916 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q19.sql @@ -0,0 +1,38 @@ +-- start query 19 in stream 0 using template query19.tpl +select + i_brand_id brand_id, + i_brand brand, + i_manufact_id, + i_manufact, + sum(ss_ext_sales_price) ext_price +from + date_dim, + store_sales, + item, + customer, + customer_address, + store +where + d_date_sk = ss_sold_date_sk + and ss_item_sk = i_item_sk + and i_manager_id = 7 + and d_moy = 11 + and d_year = 1999 + and ss_customer_sk = c_customer_sk + and c_current_addr_sk = ca_address_sk + and substr(ca_zip, 1, 5) <> substr(s_zip, 1, 5) + and ss_store_sk = s_store_sk + and ss_sold_date_sk between 2451484 and 
2451513 -- partition key filter +group by + i_brand, + i_brand_id, + i_manufact_id, + i_manufact +order by + ext_price desc, + i_brand, + i_brand_id, + i_manufact_id, + i_manufact +limit 100 +-- end query 19 in stream 0 using template query19.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q27.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q27.sql new file mode 100755 index 0000000000000..dedbc62a2ab2e --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q27.sql @@ -0,0 +1,43 @@ +-- start query 27 in stream 0 using template query27.tpl + with results as + (select i_item_id, + s_state, + ss_quantity agg1, + ss_list_price agg2, + ss_coupon_amt agg3, + ss_sales_price agg4 + --0 as g_state, + --avg(ss_quantity) agg1, + --avg(ss_list_price) agg2, + --avg(ss_coupon_amt) agg3, + --avg(ss_sales_price) agg4 + from store_sales, customer_demographics, date_dim, store, item + where ss_sold_date_sk = d_date_sk and + ss_sold_date_sk between 2451545 and 2451910 and + ss_item_sk = i_item_sk and + ss_store_sk = s_store_sk and + ss_cdemo_sk = cd_demo_sk and + cd_gender = 'F' and + cd_marital_status = 'D' and + cd_education_status = 'Primary' and + d_year = 2000 and + s_state in ('TN','AL', 'SD', 'SD', 'SD', 'SD') + --group by i_item_id, s_state + ) + + select i_item_id, + s_state, g_state, agg1, agg2, agg3, agg4 + from ( + select i_item_id, s_state, 0 as g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3, avg(agg4) agg4 from results + group by i_item_id, s_state + union all + select i_item_id, NULL AS s_state, 1 AS g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3, + avg(agg4) agg4 from results + group by i_item_id + union all + select NULL AS i_item_id, NULL as s_state, 1 as g_state, avg(agg1) agg1, avg(agg2) agg2, avg(agg3) agg3, + avg(agg4) agg4 from results + ) foo + order by i_item_id, s_state + limit 100 +-- end query 27 in stream 0 using template query27.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q3.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q3.sql new file mode 100755 index 0000000000000..35b0a20f80a4e --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q3.sql @@ -0,0 +1,228 @@ +-- start query 3 in stream 0 using template query3.tpl +select + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_net_profit) sum_agg +from + date_dim dt, + store_sales, + item +where + dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 436 + and dt.d_moy = 12 + -- partition key filters + and ( +ss_sold_date_sk between 2415355 and 2415385 +or ss_sold_date_sk between 2415720 and 2415750 +or ss_sold_date_sk between 2416085 and 2416115 +or ss_sold_date_sk between 2416450 and 2416480 +or ss_sold_date_sk between 2416816 and 2416846 +or ss_sold_date_sk between 2417181 and 2417211 +or ss_sold_date_sk between 2417546 and 2417576 +or ss_sold_date_sk between 2417911 and 2417941 +or ss_sold_date_sk between 2418277 and 2418307 +or ss_sold_date_sk between 2418642 and 2418672 +or ss_sold_date_sk between 2419007 and 2419037 +or ss_sold_date_sk between 2419372 and 2419402 +or ss_sold_date_sk between 2419738 and 2419768 +or ss_sold_date_sk between 2420103 and 2420133 +or ss_sold_date_sk between 2420468 and 2420498 +or ss_sold_date_sk between 2420833 and 2420863 +or ss_sold_date_sk between 2421199 and 2421229 +or ss_sold_date_sk between 2421564 and 2421594 +or ss_sold_date_sk between 2421929 and 2421959 +or ss_sold_date_sk between 2422294 and 
2422324 +or ss_sold_date_sk between 2422660 and 2422690 +or ss_sold_date_sk between 2423025 and 2423055 +or ss_sold_date_sk between 2423390 and 2423420 +or ss_sold_date_sk between 2423755 and 2423785 +or ss_sold_date_sk between 2424121 and 2424151 +or ss_sold_date_sk between 2424486 and 2424516 +or ss_sold_date_sk between 2424851 and 2424881 +or ss_sold_date_sk between 2425216 and 2425246 +or ss_sold_date_sk between 2425582 and 2425612 +or ss_sold_date_sk between 2425947 and 2425977 +or ss_sold_date_sk between 2426312 and 2426342 +or ss_sold_date_sk between 2426677 and 2426707 +or ss_sold_date_sk between 2427043 and 2427073 +or ss_sold_date_sk between 2427408 and 2427438 +or ss_sold_date_sk between 2427773 and 2427803 +or ss_sold_date_sk between 2428138 and 2428168 +or ss_sold_date_sk between 2428504 and 2428534 +or ss_sold_date_sk between 2428869 and 2428899 +or ss_sold_date_sk between 2429234 and 2429264 +or ss_sold_date_sk between 2429599 and 2429629 +or ss_sold_date_sk between 2429965 and 2429995 +or ss_sold_date_sk between 2430330 and 2430360 +or ss_sold_date_sk between 2430695 and 2430725 +or ss_sold_date_sk between 2431060 and 2431090 +or ss_sold_date_sk between 2431426 and 2431456 +or ss_sold_date_sk between 2431791 and 2431821 +or ss_sold_date_sk between 2432156 and 2432186 +or ss_sold_date_sk between 2432521 and 2432551 +or ss_sold_date_sk between 2432887 and 2432917 +or ss_sold_date_sk between 2433252 and 2433282 +or ss_sold_date_sk between 2433617 and 2433647 +or ss_sold_date_sk between 2433982 and 2434012 +or ss_sold_date_sk between 2434348 and 2434378 +or ss_sold_date_sk between 2434713 and 2434743 +or ss_sold_date_sk between 2435078 and 2435108 +or ss_sold_date_sk between 2435443 and 2435473 +or ss_sold_date_sk between 2435809 and 2435839 +or ss_sold_date_sk between 2436174 and 2436204 +or ss_sold_date_sk between 2436539 and 2436569 +or ss_sold_date_sk between 2436904 and 2436934 +or ss_sold_date_sk between 2437270 and 2437300 +or ss_sold_date_sk between 2437635 and 2437665 +or ss_sold_date_sk between 2438000 and 2438030 +or ss_sold_date_sk between 2438365 and 2438395 +or ss_sold_date_sk between 2438731 and 2438761 +or ss_sold_date_sk between 2439096 and 2439126 +or ss_sold_date_sk between 2439461 and 2439491 +or ss_sold_date_sk between 2439826 and 2439856 +or ss_sold_date_sk between 2440192 and 2440222 +or ss_sold_date_sk between 2440557 and 2440587 +or ss_sold_date_sk between 2440922 and 2440952 +or ss_sold_date_sk between 2441287 and 2441317 +or ss_sold_date_sk between 2441653 and 2441683 +or ss_sold_date_sk between 2442018 and 2442048 +or ss_sold_date_sk between 2442383 and 2442413 +or ss_sold_date_sk between 2442748 and 2442778 +or ss_sold_date_sk between 2443114 and 2443144 +or ss_sold_date_sk between 2443479 and 2443509 +or ss_sold_date_sk between 2443844 and 2443874 +or ss_sold_date_sk between 2444209 and 2444239 +or ss_sold_date_sk between 2444575 and 2444605 +or ss_sold_date_sk between 2444940 and 2444970 +or ss_sold_date_sk between 2445305 and 2445335 +or ss_sold_date_sk between 2445670 and 2445700 +or ss_sold_date_sk between 2446036 and 2446066 +or ss_sold_date_sk between 2446401 and 2446431 +or ss_sold_date_sk between 2446766 and 2446796 +or ss_sold_date_sk between 2447131 and 2447161 +or ss_sold_date_sk between 2447497 and 2447527 +or ss_sold_date_sk between 2447862 and 2447892 +or ss_sold_date_sk between 2448227 and 2448257 +or ss_sold_date_sk between 2448592 and 2448622 +or ss_sold_date_sk between 2448958 and 2448988 +or ss_sold_date_sk between 2449323 and 
2449353 +or ss_sold_date_sk between 2449688 and 2449718 +or ss_sold_date_sk between 2450053 and 2450083 +or ss_sold_date_sk between 2450419 and 2450449 +or ss_sold_date_sk between 2450784 and 2450814 +or ss_sold_date_sk between 2451149 and 2451179 +or ss_sold_date_sk between 2451514 and 2451544 +or ss_sold_date_sk between 2451880 and 2451910 +or ss_sold_date_sk between 2452245 and 2452275 +or ss_sold_date_sk between 2452610 and 2452640 +or ss_sold_date_sk between 2452975 and 2453005 +or ss_sold_date_sk between 2453341 and 2453371 +or ss_sold_date_sk between 2453706 and 2453736 +or ss_sold_date_sk between 2454071 and 2454101 +or ss_sold_date_sk between 2454436 and 2454466 +or ss_sold_date_sk between 2454802 and 2454832 +or ss_sold_date_sk between 2455167 and 2455197 +or ss_sold_date_sk between 2455532 and 2455562 +or ss_sold_date_sk between 2455897 and 2455927 +or ss_sold_date_sk between 2456263 and 2456293 +or ss_sold_date_sk between 2456628 and 2456658 +or ss_sold_date_sk between 2456993 and 2457023 +or ss_sold_date_sk between 2457358 and 2457388 +or ss_sold_date_sk between 2457724 and 2457754 +or ss_sold_date_sk between 2458089 and 2458119 +or ss_sold_date_sk between 2458454 and 2458484 +or ss_sold_date_sk between 2458819 and 2458849 +or ss_sold_date_sk between 2459185 and 2459215 +or ss_sold_date_sk between 2459550 and 2459580 +or ss_sold_date_sk between 2459915 and 2459945 +or ss_sold_date_sk between 2460280 and 2460310 +or ss_sold_date_sk between 2460646 and 2460676 +or ss_sold_date_sk between 2461011 and 2461041 +or ss_sold_date_sk between 2461376 and 2461406 +or ss_sold_date_sk between 2461741 and 2461771 +or ss_sold_date_sk between 2462107 and 2462137 +or ss_sold_date_sk between 2462472 and 2462502 +or ss_sold_date_sk between 2462837 and 2462867 +or ss_sold_date_sk between 2463202 and 2463232 +or ss_sold_date_sk between 2463568 and 2463598 +or ss_sold_date_sk between 2463933 and 2463963 +or ss_sold_date_sk between 2464298 and 2464328 +or ss_sold_date_sk between 2464663 and 2464693 +or ss_sold_date_sk between 2465029 and 2465059 +or ss_sold_date_sk between 2465394 and 2465424 +or ss_sold_date_sk between 2465759 and 2465789 +or ss_sold_date_sk between 2466124 and 2466154 +or ss_sold_date_sk between 2466490 and 2466520 +or ss_sold_date_sk between 2466855 and 2466885 +or ss_sold_date_sk between 2467220 and 2467250 +or ss_sold_date_sk between 2467585 and 2467615 +or ss_sold_date_sk between 2467951 and 2467981 +or ss_sold_date_sk between 2468316 and 2468346 +or ss_sold_date_sk between 2468681 and 2468711 +or ss_sold_date_sk between 2469046 and 2469076 +or ss_sold_date_sk between 2469412 and 2469442 +or ss_sold_date_sk between 2469777 and 2469807 +or ss_sold_date_sk between 2470142 and 2470172 +or ss_sold_date_sk between 2470507 and 2470537 +or ss_sold_date_sk between 2470873 and 2470903 +or ss_sold_date_sk between 2471238 and 2471268 +or ss_sold_date_sk between 2471603 and 2471633 +or ss_sold_date_sk between 2471968 and 2471998 +or ss_sold_date_sk between 2472334 and 2472364 +or ss_sold_date_sk between 2472699 and 2472729 +or ss_sold_date_sk between 2473064 and 2473094 +or ss_sold_date_sk between 2473429 and 2473459 +or ss_sold_date_sk between 2473795 and 2473825 +or ss_sold_date_sk between 2474160 and 2474190 +or ss_sold_date_sk between 2474525 and 2474555 +or ss_sold_date_sk between 2474890 and 2474920 +or ss_sold_date_sk between 2475256 and 2475286 +or ss_sold_date_sk between 2475621 and 2475651 +or ss_sold_date_sk between 2475986 and 2476016 +or ss_sold_date_sk between 2476351 and 
2476381 +or ss_sold_date_sk between 2476717 and 2476747 +or ss_sold_date_sk between 2477082 and 2477112 +or ss_sold_date_sk between 2477447 and 2477477 +or ss_sold_date_sk between 2477812 and 2477842 +or ss_sold_date_sk between 2478178 and 2478208 +or ss_sold_date_sk between 2478543 and 2478573 +or ss_sold_date_sk between 2478908 and 2478938 +or ss_sold_date_sk between 2479273 and 2479303 +or ss_sold_date_sk between 2479639 and 2479669 +or ss_sold_date_sk between 2480004 and 2480034 +or ss_sold_date_sk between 2480369 and 2480399 +or ss_sold_date_sk between 2480734 and 2480764 +or ss_sold_date_sk between 2481100 and 2481130 +or ss_sold_date_sk between 2481465 and 2481495 +or ss_sold_date_sk between 2481830 and 2481860 +or ss_sold_date_sk between 2482195 and 2482225 +or ss_sold_date_sk between 2482561 and 2482591 +or ss_sold_date_sk between 2482926 and 2482956 +or ss_sold_date_sk between 2483291 and 2483321 +or ss_sold_date_sk between 2483656 and 2483686 +or ss_sold_date_sk between 2484022 and 2484052 +or ss_sold_date_sk between 2484387 and 2484417 +or ss_sold_date_sk between 2484752 and 2484782 +or ss_sold_date_sk between 2485117 and 2485147 +or ss_sold_date_sk between 2485483 and 2485513 +or ss_sold_date_sk between 2485848 and 2485878 +or ss_sold_date_sk between 2486213 and 2486243 +or ss_sold_date_sk between 2486578 and 2486608 +or ss_sold_date_sk between 2486944 and 2486974 +or ss_sold_date_sk between 2487309 and 2487339 +or ss_sold_date_sk between 2487674 and 2487704 +or ss_sold_date_sk between 2488039 and 2488069 +) +group by + dt.d_year, + item.i_brand, + item.i_brand_id +order by + dt.d_year, + sum_agg desc, + brand_id +limit 100 +-- end query 3 in stream 0 using template query3.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q34.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q34.sql new file mode 100755 index 0000000000000..d11696e5e0c34 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q34.sql @@ -0,0 +1,45 @@ +-- start query 34 in stream 0 using template query34.tpl +select + c_last_name, + c_first_name, + c_salutation, + c_preferred_cust_flag, + ss_ticket_number, + cnt +from + (select + ss_ticket_number, + ss_customer_sk, + count(*) cnt + from + store_sales, + date_dim, + store, + household_demographics + where + store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and (date_dim.d_dom between 1 and 3 + or date_dim.d_dom between 25 and 28) + and (household_demographics.hd_buy_potential = '>10000' + or household_demographics.hd_buy_potential = 'Unknown') + and household_demographics.hd_vehicle_count > 0 + and (case when household_demographics.hd_vehicle_count > 0 then household_demographics.hd_dep_count / household_demographics.hd_vehicle_count else null end) > 1.2 + and date_dim.d_year in (1998, 1998 + 1, 1998 + 2) + and store.s_county in ('Saginaw County', 'Sumner County', 'Appanoose County', 'Daviess County', 'Fairfield County', 'Raleigh County', 'Ziebach County', 'Williamson County') + and ss_sold_date_sk between 2450816 and 2451910 -- partition key filter + group by + ss_ticket_number, + ss_customer_sk + ) dn, + customer +where + ss_customer_sk = c_customer_sk + and cnt between 15 and 20 +order by + c_last_name, + c_first_name, + c_salutation, + c_preferred_cust_flag desc +-- end query 34 in stream 0 using template query34.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q42.sql 
b/sql/core/src/test/resources/tpcds-modifiedQueries/q42.sql new file mode 100755 index 0000000000000..b6332a8afbebe --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q42.sql @@ -0,0 +1,28 @@ +-- start query 42 in stream 0 using template query42.tpl +select + dt.d_year, + item.i_category_id, + item.i_category, + sum(ss_ext_sales_price) +from + date_dim dt, + store_sales, + item +where + dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manager_id = 1 + and dt.d_moy = 12 + and dt.d_year = 1998 + and ss_sold_date_sk between 2451149 and 2451179 -- partition key filter +group by + dt.d_year, + item.i_category_id, + item.i_category +order by + sum(ss_ext_sales_price) desc, + dt.d_year, + item.i_category_id, + item.i_category +limit 100 +-- end query 42 in stream 0 using template query42.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q43.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q43.sql new file mode 100755 index 0000000000000..cc2040b2fdb7c --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q43.sql @@ -0,0 +1,36 @@ +-- start query 43 in stream 0 using template query43.tpl +select + s_store_name, + s_store_id, + sum(case when (d_day_name = 'Sunday') then ss_sales_price else null end) sun_sales, + sum(case when (d_day_name = 'Monday') then ss_sales_price else null end) mon_sales, + sum(case when (d_day_name = 'Tuesday') then ss_sales_price else null end) tue_sales, + sum(case when (d_day_name = 'Wednesday') then ss_sales_price else null end) wed_sales, + sum(case when (d_day_name = 'Thursday') then ss_sales_price else null end) thu_sales, + sum(case when (d_day_name = 'Friday') then ss_sales_price else null end) fri_sales, + sum(case when (d_day_name = 'Saturday') then ss_sales_price else null end) sat_sales +from + date_dim, + store_sales, + store +where + d_date_sk = ss_sold_date_sk + and s_store_sk = ss_store_sk + and s_gmt_offset = -5 + and d_year = 1998 + and ss_sold_date_sk between 2450816 and 2451179 -- partition key filter +group by + s_store_name, + s_store_id +order by + s_store_name, + s_store_id, + sun_sales, + mon_sales, + tue_sales, + wed_sales, + thu_sales, + fri_sales, + sat_sales +limit 100 +-- end query 43 in stream 0 using template query43.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q46.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q46.sql new file mode 100755 index 0000000000000..52b7ba4f4b86b --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q46.sql @@ -0,0 +1,80 @@ +-- start query 46 in stream 0 using template query46.tpl +select + c_last_name, + c_first_name, + ca_city, + bought_city, + ss_ticket_number, + amt, + profit +from + (select + ss_ticket_number, + ss_customer_sk, + ca_city bought_city, + sum(ss_coupon_amt) amt, + sum(ss_net_profit) profit + from + store_sales, + date_dim, + store, + household_demographics, + customer_address + where + store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and store_sales.ss_addr_sk = customer_address.ca_address_sk + and (household_demographics.hd_dep_count = 5 + or household_demographics.hd_vehicle_count = 3) + and date_dim.d_dow in (6, 0) + and date_dim.d_year in (1999, 1999 + 1, 1999 + 2) + and store.s_city in ('Midway', 'Concord', 'Spring Hill', 'Brownsville', 'Greenville') + -- partition key filter + and ss_sold_date_sk in (2451181, 2451182, 
2451188, 2451189, 2451195, 2451196, 2451202, 2451203, 2451209, 2451210, 2451216, 2451217, + 2451223, 2451224, 2451230, 2451231, 2451237, 2451238, 2451244, 2451245, 2451251, 2451252, 2451258, 2451259, + 2451265, 2451266, 2451272, 2451273, 2451279, 2451280, 2451286, 2451287, 2451293, 2451294, 2451300, 2451301, + 2451307, 2451308, 2451314, 2451315, 2451321, 2451322, 2451328, 2451329, 2451335, 2451336, 2451342, 2451343, + 2451349, 2451350, 2451356, 2451357, 2451363, 2451364, 2451370, 2451371, 2451377, 2451378, 2451384, 2451385, + 2451391, 2451392, 2451398, 2451399, 2451405, 2451406, 2451412, 2451413, 2451419, 2451420, 2451426, 2451427, + 2451433, 2451434, 2451440, 2451441, 2451447, 2451448, 2451454, 2451455, 2451461, 2451462, 2451468, 2451469, + 2451475, 2451476, 2451482, 2451483, 2451489, 2451490, 2451496, 2451497, 2451503, 2451504, 2451510, 2451511, + 2451517, 2451518, 2451524, 2451525, 2451531, 2451532, 2451538, 2451539, 2451545, 2451546, 2451552, 2451553, + 2451559, 2451560, 2451566, 2451567, 2451573, 2451574, 2451580, 2451581, 2451587, 2451588, 2451594, 2451595, + 2451601, 2451602, 2451608, 2451609, 2451615, 2451616, 2451622, 2451623, 2451629, 2451630, 2451636, 2451637, + 2451643, 2451644, 2451650, 2451651, 2451657, 2451658, 2451664, 2451665, 2451671, 2451672, 2451678, 2451679, + 2451685, 2451686, 2451692, 2451693, 2451699, 2451700, 2451706, 2451707, 2451713, 2451714, 2451720, 2451721, + 2451727, 2451728, 2451734, 2451735, 2451741, 2451742, 2451748, 2451749, 2451755, 2451756, 2451762, 2451763, + 2451769, 2451770, 2451776, 2451777, 2451783, 2451784, 2451790, 2451791, 2451797, 2451798, 2451804, 2451805, + 2451811, 2451812, 2451818, 2451819, 2451825, 2451826, 2451832, 2451833, 2451839, 2451840, 2451846, 2451847, + 2451853, 2451854, 2451860, 2451861, 2451867, 2451868, 2451874, 2451875, 2451881, 2451882, 2451888, 2451889, + 2451895, 2451896, 2451902, 2451903, 2451909, 2451910, 2451916, 2451917, 2451923, 2451924, 2451930, 2451931, + 2451937, 2451938, 2451944, 2451945, 2451951, 2451952, 2451958, 2451959, 2451965, 2451966, 2451972, 2451973, + 2451979, 2451980, 2451986, 2451987, 2451993, 2451994, 2452000, 2452001, 2452007, 2452008, 2452014, 2452015, + 2452021, 2452022, 2452028, 2452029, 2452035, 2452036, 2452042, 2452043, 2452049, 2452050, 2452056, 2452057, + 2452063, 2452064, 2452070, 2452071, 2452077, 2452078, 2452084, 2452085, 2452091, 2452092, 2452098, 2452099, + 2452105, 2452106, 2452112, 2452113, 2452119, 2452120, 2452126, 2452127, 2452133, 2452134, 2452140, 2452141, + 2452147, 2452148, 2452154, 2452155, 2452161, 2452162, 2452168, 2452169, 2452175, 2452176, 2452182, 2452183, + 2452189, 2452190, 2452196, 2452197, 2452203, 2452204, 2452210, 2452211, 2452217, 2452218, 2452224, 2452225, + 2452231, 2452232, 2452238, 2452239, 2452245, 2452246, 2452252, 2452253, 2452259, 2452260, 2452266, 2452267, + 2452273, 2452274) + group by + ss_ticket_number, + ss_customer_sk, + ss_addr_sk, + ca_city + ) dn, + customer, + customer_address current_addr +where + ss_customer_sk = c_customer_sk + and customer.c_current_addr_sk = current_addr.ca_address_sk + and current_addr.ca_city <> bought_city +order by + c_last_name, + c_first_name, + ca_city, + bought_city, + ss_ticket_number +limit 100 +-- end query 46 in stream 0 using template query46.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q52.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q52.sql new file mode 100755 index 0000000000000..a510eefb13e17 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q52.sql @@ -0,0 
+1,27 @@ +-- start query 52 in stream 0 using template query52.tpl +select + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_ext_sales_price) ext_price +from + date_dim dt, + store_sales, + item +where + dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manager_id = 1 + and dt.d_moy = 12 + and dt.d_year = 1998 + and ss_sold_date_sk between 2451149 and 2451179 -- added for partition pruning +group by + dt.d_year, + item.i_brand, + item.i_brand_id +order by + dt.d_year, + ext_price desc, + brand_id +limit 100 +-- end query 52 in stream 0 using template query52.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q53.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q53.sql new file mode 100755 index 0000000000000..fb7bb75183858 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q53.sql @@ -0,0 +1,37 @@ +-- start query 53 in stream 0 using template query53.tpl +select + * +from + (select + i_manufact_id, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) over (partition by i_manufact_id) avg_quarterly_sales + from + item, + store_sales, + date_dim, + store + where + ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and ss_store_sk = s_store_sk + and d_month_seq in (1212, 1212 + 1, 1212 + 2, 1212 + 3, 1212 + 4, 1212 + 5, 1212 + 6, 1212 + 7, 1212 + 8, 1212 + 9, 1212 + 10, 1212 + 11) + and ((i_category in ('Books', 'Children', 'Electronics') + and i_class in ('personal', 'portable', 'reference', 'self-help') + and i_brand in ('scholaramalgamalg #14', 'scholaramalgamalg #7', 'exportiunivamalg #9', 'scholaramalgamalg #9')) + or (i_category in ('Women', 'Music', 'Men') + and i_class in ('accessories', 'classical', 'fragrances', 'pants') + and i_brand in ('amalgimporto #1', 'edu packscholar #1', 'exportiimporto #1', 'importoamalg #1'))) + and ss_sold_date_sk between 2451911 and 2452275 -- partition key filter + group by + i_manufact_id, + d_qoy + ) tmp1 +where + case when avg_quarterly_sales > 0 then abs (sum_sales - avg_quarterly_sales) / avg_quarterly_sales else null end > 0.1 +order by + avg_quarterly_sales, + sum_sales, + i_manufact_id +limit 100 +-- end query 53 in stream 0 using template query53.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q55.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q55.sql new file mode 100755 index 0000000000000..47b1f0292d901 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q55.sql @@ -0,0 +1,24 @@ +-- start query 55 in stream 0 using template query55.tpl +select + i_brand_id brand_id, + i_brand brand, + sum(ss_ext_sales_price) ext_price +from + date_dim, + store_sales, + item +where + d_date_sk = ss_sold_date_sk + and ss_item_sk = i_item_sk + and i_manager_id = 48 + and d_moy = 11 + and d_year = 2001 + and ss_sold_date_sk between 2452215 and 2452244 +group by + i_brand, + i_brand_id +order by + ext_price desc, + i_brand_id +limit 100 +-- end query 55 in stream 0 using template query55.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q59.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q59.sql new file mode 100755 index 0000000000000..3d5c4e9d64419 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q59.sql @@ -0,0 +1,83 @@ +-- start query 59 in stream 0 using template query59.tpl +with + wss as + (select + d_week_seq, + ss_store_sk, + sum(case when (d_day_name = 'Sunday') then ss_sales_price else null end) sun_sales, + sum(case when (d_day_name 
= 'Monday') then ss_sales_price else null end) mon_sales, + sum(case when (d_day_name = 'Tuesday') then ss_sales_price else null end) tue_sales, + sum(case when (d_day_name = 'Wednesday') then ss_sales_price else null end) wed_sales, + sum(case when (d_day_name = 'Thursday') then ss_sales_price else null end) thu_sales, + sum(case when (d_day_name = 'Friday') then ss_sales_price else null end) fri_sales, + sum(case when (d_day_name = 'Saturday') then ss_sales_price else null end) sat_sales + from + store_sales, + date_dim + where + d_date_sk = ss_sold_date_sk + group by + d_week_seq, + ss_store_sk + ) +select + s_store_name1, + s_store_id1, + d_week_seq1, + sun_sales1 / sun_sales2, + mon_sales1 / mon_sales2, + tue_sales1 / tue_sales1, + wed_sales1 / wed_sales2, + thu_sales1 / thu_sales2, + fri_sales1 / fri_sales2, + sat_sales1 / sat_sales2 +from + (select + s_store_name s_store_name1, + wss.d_week_seq d_week_seq1, + s_store_id s_store_id1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + from + wss, + store, + date_dim d + where + d.d_week_seq = wss.d_week_seq + and ss_store_sk = s_store_sk + and d_month_seq between 1185 and 1185 + 11 + ) y, + (select + s_store_name s_store_name2, + wss.d_week_seq d_week_seq2, + s_store_id s_store_id2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + from + wss, + store, + date_dim d + where + d.d_week_seq = wss.d_week_seq + and ss_store_sk = s_store_sk + and d_month_seq between 1185 + 12 and 1185 + 23 + ) x +where + s_store_id1 = s_store_id2 + and d_week_seq1 = d_week_seq2 - 52 +order by + s_store_name1, + s_store_id1, + d_week_seq1 +limit 100 +-- end query 59 in stream 0 using template query59.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q63.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q63.sql new file mode 100755 index 0000000000000..b71199ab17d0b --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q63.sql @@ -0,0 +1,29 @@ +-- start query 63 in stream 0 using template query63.tpl +select * +from (select i_manager_id + ,sum(ss_sales_price) sum_sales + ,avg(sum(ss_sales_price)) over (partition by i_manager_id) avg_monthly_sales + from item + ,store_sales + ,date_dim + ,store + where ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and ss_sold_date_sk between 2452123 and 2452487 + and ss_store_sk = s_store_sk + and d_month_seq in (1219,1219+1,1219+2,1219+3,1219+4,1219+5,1219+6,1219+7,1219+8,1219+9,1219+10,1219+11) + and (( i_category in ('Books','Children','Electronics') + and i_class in ('personal','portable','reference','self-help') + and i_brand in ('scholaramalgamalg #14','scholaramalgamalg #7', + 'exportiunivamalg #9','scholaramalgamalg #9')) + or( i_category in ('Women','Music','Men') + and i_class in ('accessories','classical','fragrances','pants') + and i_brand in ('amalgimporto #1','edu packscholar #1','exportiimporto #1', + 'importoamalg #1'))) +group by i_manager_id, d_moy) tmp1 +where case when avg_monthly_sales > 0 then abs (sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 +order by i_manager_id + ,avg_monthly_sales + ,sum_sales +limit 100 +-- end query 63 in stream 0 using template query63.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q65.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q65.sql new 
file mode 100755 index 0000000000000..7344feeff6a9f --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q65.sql @@ -0,0 +1,58 @@ +-- start query 65 in stream 0 using template query65.tpl +select + s_store_name, + i_item_desc, + sc.revenue, + i_current_price, + i_wholesale_cost, + i_brand +from + store, + item, + (select + ss_store_sk, + avg(revenue) as ave + from + (select + ss_store_sk, + ss_item_sk, + sum(ss_sales_price) as revenue + from + store_sales, + date_dim + where + ss_sold_date_sk = d_date_sk + and d_month_seq between 1212 and 1212 + 11 + and ss_sold_date_sk between 2451911 and 2452275 -- partition key filter + group by + ss_store_sk, + ss_item_sk + ) sa + group by + ss_store_sk + ) sb, + (select + ss_store_sk, + ss_item_sk, + sum(ss_sales_price) as revenue + from + store_sales, + date_dim + where + ss_sold_date_sk = d_date_sk + and d_month_seq between 1212 and 1212 + 11 + and ss_sold_date_sk between 2451911 and 2452275 -- partition key filter + group by + ss_store_sk, + ss_item_sk + ) sc +where + sb.ss_store_sk = sc.ss_store_sk + and sc.revenue <= 0.1 * sb.ave + and s_store_sk = sc.ss_store_sk + and i_item_sk = sc.ss_item_sk +order by + s_store_name, + i_item_desc +limit 100 +-- end query 65 in stream 0 using template query65.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q68.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q68.sql new file mode 100755 index 0000000000000..94df4b3f57a90 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q68.sql @@ -0,0 +1,62 @@ +-- start query 68 in stream 0 using template query68.tpl +-- changed to match exact same partitions in original query +select + c_last_name, + c_first_name, + ca_city, + bought_city, + ss_ticket_number, + extended_price, + extended_tax, + list_price +from + (select + ss_ticket_number, + ss_customer_sk, + ca_city bought_city, + sum(ss_ext_sales_price) extended_price, + sum(ss_ext_list_price) list_price, + sum(ss_ext_tax) extended_tax + from + store_sales, + date_dim, + store, + household_demographics, + customer_address + where + store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and store_sales.ss_addr_sk = customer_address.ca_address_sk + and date_dim.d_dom between 1 and 2 + and (household_demographics.hd_dep_count = 5 + or household_demographics.hd_vehicle_count = 3) + and date_dim.d_year in (1999, 1999 + 1, 1999 + 2) + and store.s_city in ('Midway', 'Fairview') + -- partition key filter + and ss_sold_date_sk in (2451180, 2451181, 2451211, 2451212, 2451239, 2451240, 2451270, 2451271, 2451300, 2451301, 2451331, + 2451332, 2451361, 2451362, 2451392, 2451393, 2451423, 2451424, 2451453, 2451454, 2451484, 2451485, + 2451514, 2451515, 2451545, 2451546, 2451576, 2451577, 2451605, 2451606, 2451636, 2451637, 2451666, + 2451667, 2451697, 2451698, 2451727, 2451728, 2451758, 2451759, 2451789, 2451790, 2451819, 2451820, + 2451850, 2451851, 2451880, 2451881, 2451911, 2451912, 2451942, 2451943, 2451970, 2451971, 2452001, + 2452002, 2452031, 2452032, 2452062, 2452063, 2452092, 2452093, 2452123, 2452124, 2452154, 2452155, + 2452184, 2452185, 2452215, 2452216, 2452245, 2452246) + --and ss_sold_date_sk between 2451180 and 2451269 -- partition key filter (3 months) + --and d_date between '1999-01-01' and '1999-03-31' + group by + ss_ticket_number, + ss_customer_sk, + ss_addr_sk, + ca_city + ) dn, + customer, + customer_address current_addr +where + 
ss_customer_sk = c_customer_sk + and customer.c_current_addr_sk = current_addr.ca_address_sk + and current_addr.ca_city <> bought_city +order by + c_last_name, + ss_ticket_number +limit 100 +-- end query 68 in stream 0 using template query68.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q7.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q7.sql new file mode 100755 index 0000000000000..c61a2d0d2a8fa --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q7.sql @@ -0,0 +1,31 @@ +-- start query 7 in stream 0 using template query7.tpl +select + i_item_id, + avg(ss_quantity) agg1, + avg(ss_list_price) agg2, + avg(ss_coupon_amt) agg3, + avg(ss_sales_price) agg4 +from + store_sales, + customer_demographics, + date_dim, + item, + promotion +where + ss_sold_date_sk = d_date_sk + and ss_item_sk = i_item_sk + and ss_cdemo_sk = cd_demo_sk + and ss_promo_sk = p_promo_sk + and cd_gender = 'F' + and cd_marital_status = 'W' + and cd_education_status = 'Primary' + and (p_channel_email = 'N' + or p_channel_event = 'N') + and d_year = 1998 + and ss_sold_date_sk between 2450815 and 2451179 -- partition key filter +group by + i_item_id +order by + i_item_id +limit 100 +-- end query 7 in stream 0 using template query7.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q73.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q73.sql new file mode 100755 index 0000000000000..8703910b305a8 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q73.sql @@ -0,0 +1,49 @@ +-- start query 73 in stream 0 using template query73.tpl +select + c_last_name, + c_first_name, + c_salutation, + c_preferred_cust_flag, + ss_ticket_number, + cnt +from + (select + ss_ticket_number, + ss_customer_sk, + count(*) cnt + from + store_sales, + date_dim, + store, + household_demographics + where + store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and date_dim.d_dom between 1 and 2 + and (household_demographics.hd_buy_potential = '>10000' + or household_demographics.hd_buy_potential = 'Unknown') + and household_demographics.hd_vehicle_count > 0 + and case when household_demographics.hd_vehicle_count > 0 then household_demographics.hd_dep_count / household_demographics.hd_vehicle_count else null end > 1 + and date_dim.d_year in (1998, 1998 + 1, 1998 + 2) + and store.s_county in ('Fairfield County','Ziebach County','Bronx County','Barrow County') + -- partition key filter + and ss_sold_date_sk in (2450815, 2450816, 2450846, 2450847, 2450874, 2450875, 2450905, 2450906, 2450935, 2450936, 2450966, 2450967, + 2450996, 2450997, 2451027, 2451028, 2451058, 2451059, 2451088, 2451089, 2451119, 2451120, 2451149, + 2451150, 2451180, 2451181, 2451211, 2451212, 2451239, 2451240, 2451270, 2451271, 2451300, 2451301, + 2451331, 2451332, 2451361, 2451362, 2451392, 2451393, 2451423, 2451424, 2451453, 2451454, 2451484, + 2451485, 2451514, 2451515, 2451545, 2451546, 2451576, 2451577, 2451605, 2451606, 2451636, 2451637, + 2451666, 2451667, 2451697, 2451698, 2451727, 2451728, 2451758, 2451759, 2451789, 2451790, 2451819, + 2451820, 2451850, 2451851, 2451880, 2451881) + --and ss_sold_date_sk between 2451180 and 2451269 -- partition key filter (3 months) + group by + ss_ticket_number, + ss_customer_sk + ) dj, + customer +where + ss_customer_sk = c_customer_sk + and cnt between 1 and 5 +order by + cnt desc +-- end query 73 in stream 0 using template query73.tpl diff --git 
a/sql/core/src/test/resources/tpcds-modifiedQueries/q79.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q79.sql new file mode 100755 index 0000000000000..4254310ecd10b --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q79.sql @@ -0,0 +1,59 @@ +-- start query 79 in stream 0 using template query79.tpl +select + c_last_name, + c_first_name, + substr(s_city, 1, 30), + ss_ticket_number, + amt, + profit +from + (select + ss_ticket_number, + ss_customer_sk, + store.s_city, + sum(ss_coupon_amt) amt, + sum(ss_net_profit) profit + from + store_sales, + date_dim, + store, + household_demographics + where + store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and (household_demographics.hd_dep_count = 8 + or household_demographics.hd_vehicle_count > 0) + and date_dim.d_dow = 1 + and date_dim.d_year in (1998, 1998 + 1, 1998 + 2) + and store.s_number_employees between 200 and 295 + and ss_sold_date_sk between 2450819 and 2451904 + -- partition key filter + --and ss_sold_date_sk in (2450819, 2450826, 2450833, 2450840, 2450847, 2450854, 2450861, 2450868, 2450875, 2450882, 2450889, + -- 2450896, 2450903, 2450910, 2450917, 2450924, 2450931, 2450938, 2450945, 2450952, 2450959, 2450966, 2450973, 2450980, 2450987, + -- 2450994, 2451001, 2451008, 2451015, 2451022, 2451029, 2451036, 2451043, 2451050, 2451057, 2451064, 2451071, 2451078, 2451085, + -- 2451092, 2451099, 2451106, 2451113, 2451120, 2451127, 2451134, 2451141, 2451148, 2451155, 2451162, 2451169, 2451176, 2451183, + -- 2451190, 2451197, 2451204, 2451211, 2451218, 2451225, 2451232, 2451239, 2451246, 2451253, 2451260, 2451267, 2451274, 2451281, + -- 2451288, 2451295, 2451302, 2451309, 2451316, 2451323, 2451330, 2451337, 2451344, 2451351, 2451358, 2451365, 2451372, 2451379, + -- 2451386, 2451393, 2451400, 2451407, 2451414, 2451421, 2451428, 2451435, 2451442, 2451449, 2451456, 2451463, 2451470, 2451477, + -- 2451484, 2451491, 2451498, 2451505, 2451512, 2451519, 2451526, 2451533, 2451540, 2451547, 2451554, 2451561, 2451568, 2451575, + -- 2451582, 2451589, 2451596, 2451603, 2451610, 2451617, 2451624, 2451631, 2451638, 2451645, 2451652, 2451659, 2451666, 2451673, + -- 2451680, 2451687, 2451694, 2451701, 2451708, 2451715, 2451722, 2451729, 2451736, 2451743, 2451750, 2451757, 2451764, 2451771, + -- 2451778, 2451785, 2451792, 2451799, 2451806, 2451813, 2451820, 2451827, 2451834, 2451841, 2451848, 2451855, 2451862, 2451869, + -- 2451876, 2451883, 2451890, 2451897, 2451904) + group by + ss_ticket_number, + ss_customer_sk, + ss_addr_sk, + store.s_city + ) ms, + customer +where + ss_customer_sk = c_customer_sk +order by + c_last_name, + c_first_name, + substr(s_city, 1, 30), + profit + limit 100 +-- end query 79 in stream 0 using template query79.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q89.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q89.sql new file mode 100755 index 0000000000000..b1d814af5e57a --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q89.sql @@ -0,0 +1,43 @@ +-- start query 89 in stream 0 using template query89.tpl +select + * +from + (select + i_category, + i_class, + i_brand, + s_store_name, + s_company_name, + d_moy, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) over (partition by i_category, i_brand, s_store_name, s_company_name) avg_monthly_sales + from + item, + store_sales, + date_dim, + store + where + ss_item_sk = i_item_sk + and 
ss_sold_date_sk = d_date_sk + and ss_store_sk = s_store_sk + and d_year in (2000) + and ((i_category in ('Home', 'Books', 'Electronics') + and i_class in ('wallpaper', 'parenting', 'musical')) + or (i_category in ('Shoes', 'Jewelry', 'Men') + and i_class in ('womens', 'birdal', 'pants'))) + and ss_sold_date_sk between 2451545 and 2451910 -- partition key filter + group by + i_category, + i_class, + i_brand, + s_store_name, + s_company_name, + d_moy + ) tmp1 +where + case when (avg_monthly_sales <> 0) then (abs(sum_sales - avg_monthly_sales) / avg_monthly_sales) else null end > 0.1 +order by + sum_sales - avg_monthly_sales, + s_store_name +limit 100 +-- end query 89 in stream 0 using template query89.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/q98.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/q98.sql new file mode 100755 index 0000000000000..f53f2f5f9c5b6 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/q98.sql @@ -0,0 +1,32 @@ +-- start query 98 in stream 0 using template query98.tpl +select + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ss_ext_sales_price) as itemrevenue, + sum(ss_ext_sales_price) * 100 / sum(sum(ss_ext_sales_price)) over (partition by i_class) as revenueratio +from + store_sales, + item, + date_dim +where + ss_item_sk = i_item_sk + and i_category in ('Jewelry', 'Sports', 'Books') + and ss_sold_date_sk = d_date_sk + and ss_sold_date_sk between 2451911 and 2451941 -- partition key filter (1 calendar month) + and d_date between '2001-01-01' and '2001-01-31' +group by + i_item_id, + i_item_desc, + i_category, + i_class, + i_current_price +order by + i_category, + i_class, + i_item_id, + i_item_desc, + revenueratio +--limit 1000; -- added limit +-- end query 98 in stream 0 using template query98.tpl diff --git a/sql/core/src/test/resources/tpcds-modifiedQueries/ss_max.sql b/sql/core/src/test/resources/tpcds-modifiedQueries/ss_max.sql new file mode 100755 index 0000000000000..bf58b4bb3c5a5 --- /dev/null +++ b/sql/core/src/test/resources/tpcds-modifiedQueries/ss_max.sql @@ -0,0 +1,14 @@ +select + count(*) as total, + count(ss_sold_date_sk) as not_null_total, + count(distinct ss_sold_date_sk) as unique_days, + max(ss_sold_date_sk) as max_ss_sold_date_sk, + max(ss_sold_time_sk) as max_ss_sold_time_sk, + max(ss_item_sk) as max_ss_item_sk, + max(ss_customer_sk) as max_ss_customer_sk, + max(ss_cdemo_sk) as max_ss_cdemo_sk, + max(ss_hdemo_sk) as max_ss_hdemo_sk, + max(ss_addr_sk) as max_ss_addr_sk, + max(ss_store_sk) as max_ss_store_sk, + max(ss_promo_sk) as max_ss_promo_sk +from store_sales diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index c0797fa55f5da..e47d4b0ee25d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -22,9 +22,18 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.util.resourceToString import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.util.Utils +/** + * This test suite ensures all the TPC-DS queries can be successfully analyzed and optimized + * without hitting the max iteration threshold. 
+ */ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfterAll { + // When Utils.isTesting is true, the RuleExecutor will issue an exception when hitting + // the max iteration of analyzer/optimizer batches. + assert(Utils.isTesting, "spark.testing is not set to true") + /** * Drop all the tables */ @@ -341,8 +350,23 @@ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte classLoader = Thread.currentThread().getContextClassLoader) test(name) { withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - sql(queryString).collect() + // Just check the plans can be properly generated + sql(queryString).queryExecution.executedPlan } } } + + // These queries are from https://github.com/cloudera/impala-tpcds-kit/tree/master/queries + val modifiedTPCDSQueries = Seq( + "q3", "q7", "q10", "q19", "q27", "q34", "q42", "q43", "q46", "q52", "q53", "q55", "q59", + "q63", "q65", "q68", "q73", "q79", "q89", "q98", "ss_max") + + modifiedTPCDSQueries.foreach { name => + val queryString = resourceToString(s"tpcds-modifiedQueries/$name.sql", + classLoader = Thread.currentThread().getContextClassLoader) + test(s"modified-$name") { + // Just check the plans can be properly generated + sql(queryString).queryExecution.executedPlan + } + } } From 472864014c42da08b9d3f3fffbe657c6fcf1e2ef Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 29 Sep 2017 11:45:58 -0700 Subject: [PATCH 647/779] Revert "[SPARK-22142][BUILD][STREAMING] Move Flume support behind a profile" This reverts commit a2516f41aef68e39df7f6380fd2618cc148a609e. --- dev/create-release/release-build.sh | 4 ++-- dev/mima | 2 +- dev/scalastyle | 1 - dev/sparktestsupport/modules.py | 20 +------------------- dev/test-dependencies.sh | 2 +- docs/building-spark.md | 6 ------ pom.xml | 13 +++---------- project/SparkBuild.scala | 17 ++++++++--------- python/pyspark/streaming/tests.py | 16 +++------------- 9 files changed, 19 insertions(+), 62 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 7e8d5c7075195..5390f5916fc0d 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -84,9 +84,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/mima b/dev/mima index 1e3ca9700bc07..fdb21f5007cf2 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index 89ecc8abd6f8c..e5aa589869535 100755 --- a/dev/scalastyle +++ 
b/dev/scalastyle @@ -25,7 +25,6 @@ ERRORS=$(echo -e "q\n" \ -Pmesos \ -Pkafka-0-8 \ -Pyarn \ - -Pflume \ -Phive \ -Phive-thriftserver \ scalastyle test:scalastyle \ diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 91d5667ed1f07..50e14b60545af 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -279,12 +279,6 @@ def __hash__(self): source_file_regexes=[ "external/flume-sink", ], - build_profile_flags=[ - "-Pflume", - ], - environ={ - "ENABLE_FLUME_TESTS": "1" - }, sbt_test_goals=[ "streaming-flume-sink/test", ] @@ -297,12 +291,6 @@ def __hash__(self): source_file_regexes=[ "external/flume", ], - build_profile_flags=[ - "-Pflume", - ], - environ={ - "ENABLE_FLUME_TESTS": "1" - }, sbt_test_goals=[ "streaming-flume/test", ] @@ -314,13 +302,7 @@ def __hash__(self): dependencies=[streaming_flume, streaming_flume_sink], source_file_regexes=[ "external/flume-assembly", - ], - build_profile_flags=[ - "-Pflume", - ], - environ={ - "ENABLE_FLUME_TESTS": "1" - } + ] ) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 58b295d4f6e00..c7714578bd005 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 diff --git a/docs/building-spark.md b/docs/building-spark.md index e1532de16108d..57baa503259c1 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -100,12 +100,6 @@ Note: Kafka 0.8 support is deprecated as of Spark 2.3.0. Kafka 0.10 support is still automatically built. -## Building with Flume support - -Apache Flume support must be explicitly enabled with the `flume` profile. - - ./build/mvn -Pflume -DskipTests clean package - ## Building submodules individually It's possible to build Spark sub-modules using the `mvn -pl` option. 
diff --git a/pom.xml b/pom.xml index 9fac8b1e53788..87a468c3a6f55 100644 --- a/pom.xml +++ b/pom.xml @@ -98,13 +98,15 @@ sql/core sql/hive assembly + external/flume + external/flume-sink + external/flume-assembly examples repl launcher external/kafka-0-10 external/kafka-0-10-assembly external/kafka-0-10-sql - @@ -2581,15 +2583,6 @@ - - flume - - external/flume - external/flume-sink - external/flume-assembly - - - spark-ganglia-lgpl diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 9501eed1e906b..a568d264cb2db 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -43,8 +43,11 @@ object BuildCommons { "catalyst", "sql", "hive", "hive-thriftserver", "sql-kafka-0-10" ).map(ProjectRef(buildLocation, _)) - val streamingProjects@Seq(streaming, streamingKafka010) = - Seq("streaming", "streaming-kafka-0-10").map(ProjectRef(buildLocation, _)) + val streamingProjects@Seq( + streaming, streamingFlumeSink, streamingFlume, streamingKafka010 + ) = Seq( + "streaming", "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-10" + ).map(ProjectRef(buildLocation, _)) val allProjects@Seq( core, graphx, mllib, mllibLocal, repl, networkCommon, networkShuffle, launcher, unsafe, tags, sketch, kvstore, _* @@ -53,13 +56,9 @@ object BuildCommons { "tags", "sketch", "kvstore" ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects - val optionallyEnabledProjects@Seq(mesos, yarn, - streamingFlumeSink, streamingFlume, - streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, - dockerIntegrationTests, hadoopCloud) = - Seq("mesos", "yarn", - "streaming-flume-sink", "streaming-flume", - "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", + val optionallyEnabledProjects@Seq(mesos, yarn, streamingKafka, sparkGangliaLgpl, + streamingKinesisAsl, dockerIntegrationTests, hadoopCloud) = + Seq("mesos", "yarn", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKafka010Assembly, streamingKinesisAslAssembly) = diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 5b86c1cb2c390..229cf53e47359 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -1478,7 +1478,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/package streaming-kafka-0-8-assembly/assembly' or " - "'build/mvn -Pkafka-0-8 package' before running this test.") + "'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1495,7 +1495,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. 
" % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " - "'build/mvn -Pflume package' before running this test.") + "'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1516,9 +1516,6 @@ def search_kinesis_asl_assembly_jar(): return jars[0] -# Must be same as the variable and condition defined in modules.py -flume_test_environ_var = "ENABLE_FLUME_TESTS" -are_flume_tests_enabled = os.environ.get(flume_test_environ_var) == '1' # Must be same as the variable and condition defined in modules.py kafka_test_environ_var = "ENABLE_KAFKA_0_8_TESTS" are_kafka_tests_enabled = os.environ.get(kafka_test_environ_var) == '1' @@ -1541,16 +1538,9 @@ def search_kinesis_asl_assembly_jar(): os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, + FlumeStreamTests, FlumePollingStreamTests, StreamingListenerTests] - if are_flume_tests_enabled: - testcases.append(FlumeStreamTests) - testcases.append(FlumePollingStreamTests) - else: - sys.stderr.write( - "Skipped test_flume_stream (enable by setting environment variable %s=1" - % flume_test_environ_var) - if are_kafka_tests_enabled: testcases.append(KafkaStreamTests) else: From 530fe683297cb11b920a4df6630eff5d7e7ddce2 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 29 Sep 2017 19:35:32 -0700 Subject: [PATCH 648/779] [SPARK-21904][SQL] Rename tempTables to tempViews in SessionCatalog ### What changes were proposed in this pull request? `tempTables` is not right. To be consistent, we need to rename the internal variable names/comments to tempViews in SessionCatalog too. ### How was this patch tested? N/A Author: gatorsmile Closes #19117 from gatorsmile/renameTempTablesToTempViews. 
--- .../sql/catalyst/catalog/SessionCatalog.scala | 79 +++++++++---------- .../sql/execution/command/DDLSuite.scala | 10 +-- 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 9407b727bca4c..6ba9ee5446a01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.catalog -import java.lang.reflect.InvocationTargetException import java.net.URI import java.util.Locale import java.util.concurrent.Callable @@ -25,7 +24,6 @@ import javax.annotation.concurrent.GuardedBy import scala.collection.mutable import scala.util.{Failure, Success, Try} -import scala.util.control.NonFatal import com.google.common.cache.{Cache, CacheBuilder} import org.apache.hadoop.conf.Configuration @@ -41,7 +39,6 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, View} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -52,7 +49,7 @@ object SessionCatalog { /** * An internal catalog that is used by a Spark Session. This internal catalog serves as a * proxy to the underlying metastore (e.g. Hive Metastore) and it also manages temporary - * tables and functions of the Spark Session that it belongs to. + * views and functions of the Spark Session that it belongs to. * * This class must be thread-safe. */ @@ -90,13 +87,13 @@ class SessionCatalog( new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) } - /** List of temporary tables, mapping from table name to their logical plan. */ + /** List of temporary views, mapping from table name to their logical plan. */ @GuardedBy("this") - protected val tempTables = new mutable.HashMap[String, LogicalPlan] + protected val tempViews = new mutable.HashMap[String, LogicalPlan] // Note: we track current database here because certain operations do not explicitly // specify the database (e.g. DROP TABLE my_table). In these cases we must first - // check whether the temporary table or function exists, then, if not, operate on + // check whether the temporary view or function exists, then, if not, operate on // the corresponding item in the current database. @GuardedBy("this") protected var currentDb: String = formatDatabaseName(DEFAULT_DATABASE) @@ -272,8 +269,8 @@ class SessionCatalog( // ---------------------------------------------------------------------------- // Tables // ---------------------------------------------------------------------------- - // There are two kinds of tables, temporary tables and metastore tables. - // Temporary tables are isolated across sessions and do not belong to any + // There are two kinds of tables, temporary views and metastore tables. + // Temporary views are isolated across sessions and do not belong to any // particular database. Metastore tables can be used across multiple // sessions as their metadata is persisted in the underlying catalog. 
// ---------------------------------------------------------------------------- @@ -462,10 +459,10 @@ class SessionCatalog( tableDefinition: LogicalPlan, overrideIfExists: Boolean): Unit = synchronized { val table = formatTableName(name) - if (tempTables.contains(table) && !overrideIfExists) { + if (tempViews.contains(table) && !overrideIfExists) { throw new TempTableAlreadyExistsException(name) } - tempTables.put(table, tableDefinition) + tempViews.put(table, tableDefinition) } /** @@ -487,7 +484,7 @@ class SessionCatalog( viewDefinition: LogicalPlan): Boolean = synchronized { val viewName = formatTableName(name.table) if (name.database.isEmpty) { - if (tempTables.contains(viewName)) { + if (tempViews.contains(viewName)) { createTempView(viewName, viewDefinition, overrideIfExists = true) true } else { @@ -504,7 +501,7 @@ class SessionCatalog( * Return a local temporary view exactly as it was stored. */ def getTempView(name: String): Option[LogicalPlan] = synchronized { - tempTables.get(formatTableName(name)) + tempViews.get(formatTableName(name)) } /** @@ -520,7 +517,7 @@ class SessionCatalog( * Returns true if this view is dropped successfully, false otherwise. */ def dropTempView(name: String): Boolean = synchronized { - tempTables.remove(formatTableName(name)).isDefined + tempViews.remove(formatTableName(name)).isDefined } /** @@ -572,7 +569,7 @@ class SessionCatalog( * Rename a table. * * If a database is specified in `oldName`, this will rename the table in that database. - * If no database is specified, this will first attempt to rename a temporary table with + * If no database is specified, this will first attempt to rename a temporary view with * the same name, then, if that does not exist, rename the table in the current database. * * This assumes the database specified in `newName` matches the one in `oldName`. @@ -592,7 +589,7 @@ class SessionCatalog( globalTempViewManager.rename(oldTableName, newTableName) } else { requireDbExists(db) - if (oldName.database.isDefined || !tempTables.contains(oldTableName)) { + if (oldName.database.isDefined || !tempViews.contains(oldTableName)) { requireTableExists(TableIdentifier(oldTableName, Some(db))) requireTableNotExists(TableIdentifier(newTableName, Some(db))) validateName(newTableName) @@ -600,16 +597,16 @@ class SessionCatalog( } else { if (newName.database.isDefined) { throw new AnalysisException( - s"RENAME TEMPORARY TABLE from '$oldName' to '$newName': cannot specify database " + + s"RENAME TEMPORARY VIEW from '$oldName' to '$newName': cannot specify database " + s"name '${newName.database.get}' in the destination table") } - if (tempTables.contains(newTableName)) { - throw new AnalysisException(s"RENAME TEMPORARY TABLE from '$oldName' to '$newName': " + + if (tempViews.contains(newTableName)) { + throw new AnalysisException(s"RENAME TEMPORARY VIEW from '$oldName' to '$newName': " + "destination table already exists") } - val table = tempTables(oldTableName) - tempTables.remove(oldTableName) - tempTables.put(newTableName, table) + val table = tempViews(oldTableName) + tempViews.remove(oldTableName) + tempViews.put(newTableName, table) } } } @@ -618,7 +615,7 @@ class SessionCatalog( * Drop a table. * * If a database is specified in `name`, this will drop the table from that database. 
- * If no database is specified, this will first attempt to drop a temporary table with + * If no database is specified, this will first attempt to drop a temporary view with * the same name, then, if that does not exist, drop the table from the current database. */ def dropTable( @@ -633,7 +630,7 @@ class SessionCatalog( throw new NoSuchTableException(globalTempViewManager.database, table) } } else { - if (name.database.isDefined || !tempTables.contains(table)) { + if (name.database.isDefined || !tempViews.contains(table)) { requireDbExists(db) // When ignoreIfNotExists is false, no exception is issued when the table does not exist. // Instead, log it as an error message. @@ -643,7 +640,7 @@ class SessionCatalog( throw new NoSuchTableException(db = db, table = table) } } else { - tempTables.remove(table) + tempViews.remove(table) } } } @@ -652,7 +649,7 @@ class SessionCatalog( * Return a [[LogicalPlan]] that represents the given table or view. * * If a database is specified in `name`, this will return the table/view from that database. - * If no database is specified, this will first attempt to return a temporary table/view with + * If no database is specified, this will first attempt to return a temporary view with * the same name, then, if that does not exist, return the table/view from the current database. * * Note that, the global temp view database is also valid here, this will return the global temp @@ -671,7 +668,7 @@ class SessionCatalog( globalTempViewManager.get(table).map { viewDef => SubqueryAlias(table, viewDef) }.getOrElse(throw new NoSuchTableException(db, table)) - } else if (name.database.isDefined || !tempTables.contains(table)) { + } else if (name.database.isDefined || !tempViews.contains(table)) { val metadata = externalCatalog.getTable(db, table) if (metadata.tableType == CatalogTableType.VIEW) { val viewText = metadata.viewText.getOrElse(sys.error("Invalid view without text.")) @@ -687,21 +684,21 @@ class SessionCatalog( SubqueryAlias(table, UnresolvedCatalogRelation(metadata)) } } else { - SubqueryAlias(table, tempTables(table)) + SubqueryAlias(table, tempViews(table)) } } } /** - * Return whether a table with the specified name is a temporary table. + * Return whether a table with the specified name is a temporary view. * - * Note: The temporary table cache is checked only when database is not + * Note: The temporary view cache is checked only when database is not * explicitly specified. */ def isTemporaryTable(name: TableIdentifier): Boolean = synchronized { val table = formatTableName(name.table) if (name.database.isEmpty) { - tempTables.contains(table) + tempViews.contains(table) } else if (formatDatabaseName(name.database.get) == globalTempViewManager.database) { globalTempViewManager.get(table).isDefined } else { @@ -710,7 +707,7 @@ class SessionCatalog( } /** - * List all tables in the specified database, including local temporary tables. + * List all tables in the specified database, including local temporary views. * * Note that, if the specified database is global temporary view database, we will list global * temporary views. @@ -718,7 +715,7 @@ class SessionCatalog( def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*") /** - * List all matching tables in the specified database, including local temporary tables. + * List all matching tables in the specified database, including local temporary views. * * Note that, if the specified database is global temporary view database, we will list global * temporary views. 
@@ -736,7 +733,7 @@ class SessionCatalog( } } val localTempViews = synchronized { - StringUtils.filterPattern(tempTables.keys.toSeq, pattern).map { name => + StringUtils.filterPattern(tempViews.keys.toSeq, pattern).map { name => TableIdentifier(name) } } @@ -750,11 +747,11 @@ class SessionCatalog( val dbName = formatDatabaseName(name.database.getOrElse(currentDb)) val tableName = formatTableName(name.table) - // Go through temporary tables and invalidate them. + // Go through temporary views and invalidate them. // If the database is defined, this may be a global temporary view. - // If the database is not defined, there is a good chance this is a temp table. + // If the database is not defined, there is a good chance this is a temp view. if (name.database.isEmpty) { - tempTables.get(tableName).foreach(_.refresh()) + tempViews.get(tableName).foreach(_.refresh()) } else if (dbName == globalTempViewManager.database) { globalTempViewManager.get(tableName).foreach(_.refresh()) } @@ -765,11 +762,11 @@ class SessionCatalog( } /** - * Drop all existing temporary tables. + * Drop all existing temporary views. * For testing only. */ def clearTempTables(): Unit = synchronized { - tempTables.clear() + tempViews.clear() } // ---------------------------------------------------------------------------- @@ -1337,7 +1334,7 @@ class SessionCatalog( */ private[sql] def copyStateTo(target: SessionCatalog): Unit = synchronized { target.currentDb = currentDb - // copy over temporary tables - tempTables.foreach(kv => target.tempTables.put(kv._1, kv._2)) + // copy over temporary views + tempViews.foreach(kv => target.tempViews.put(kv._1, kv._2)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index d19cfeef7d19f..4ed2cecc5faff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -795,7 +795,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("teachers"), df) } - test("rename temporary table - destination table with database name") { + test("rename temporary view - destination table with database name") { withTempView("tab1") { sql( """ @@ -812,7 +812,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql("ALTER TABLE tab1 RENAME TO default.tab2") } assert(e.getMessage.contains( - "RENAME TEMPORARY TABLE from '`tab1`' to '`default`.`tab2`': " + + "RENAME TEMPORARY VIEW from '`tab1`' to '`default`.`tab2`': " + "cannot specify database name 'default' in the destination table")) val catalog = spark.sessionState.catalog @@ -820,7 +820,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } - test("rename temporary table") { + test("rename temporary view") { withTempView("tab1", "tab2") { spark.range(10).createOrReplaceTempView("tab1") sql("ALTER TABLE tab1 RENAME TO tab2") @@ -832,7 +832,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } - test("rename temporary table - destination table already exists") { + test("rename temporary view - destination table already exists") { withTempView("tab1", "tab2") { sql( """ @@ -860,7 +860,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql("ALTER TABLE tab1 RENAME TO tab2") } assert(e.getMessage.contains( - "RENAME TEMPORARY TABLE from '`tab1`' to '`tab2`': destination table already exists")) + "RENAME TEMPORARY VIEW from 
'`tab1`' to '`tab2`': destination table already exists")) val catalog = spark.sessionState.catalog assert(catalog.listTables("default") == Seq(TableIdentifier("tab1"), TableIdentifier("tab2"))) From c6610a997f69148a1f1bbf69360e8f39e24cb70a Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 29 Sep 2017 21:36:52 -0700 Subject: [PATCH 649/779] [SPARK-22122][SQL] Use analyzed logical plans to count input rows in TPCDSQueryBenchmark ## What changes were proposed in this pull request? Since the current code ignores WITH clauses to check input relations in TPCDS queries, this leads to inaccurate per-row processing time for benchmark results. For example, in `q2`, this fix could catch all the input relations: `web_sales`, `date_dim`, and `catalog_sales` (the current code catches `date_dim` only). The one-third of the TPCDS queries uses WITH clauses, so I think it is worth fixing this. ## How was this patch tested? Manually checked. Author: Takeshi Yamamuro Closes #19344 from maropu/RespectWithInTPCDSBench. --- .../benchmark/TPCDSQueryBenchmark.scala | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index 99c6df7389205..69247d7f4e9aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -20,11 +20,10 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.expressions.SubqueryExpression -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.catalog.HiveTableRelation +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.util.Benchmark /** @@ -66,24 +65,15 @@ object TPCDSQueryBenchmark extends Logging { classLoader = Thread.currentThread().getContextClassLoader) // This is an indirect hack to estimate the size of each query's input by traversing the - // logical plan and adding up the sizes of all tables that appear in the plan. Note that this - // currently doesn't take WITH subqueries into account which might lead to fairly inaccurate - // per-row processing time for those cases. + // logical plan and adding up the sizes of all tables that appear in the plan. 
val queryRelations = scala.collection.mutable.HashSet[String]() - spark.sql(queryString).queryExecution.logical.map { - case UnresolvedRelation(t: TableIdentifier) => - queryRelations.add(t.table) - case lp: LogicalPlan => - lp.expressions.foreach { _ foreach { - case subquery: SubqueryExpression => - subquery.plan.foreach { - case UnresolvedRelation(t: TableIdentifier) => - queryRelations.add(t.table) - case _ => - } - case _ => - } - } + spark.sql(queryString).queryExecution.analyzed.foreach { + case SubqueryAlias(alias, _: LogicalRelation) => + queryRelations.add(alias) + case LogicalRelation(_, _, Some(catalogTable), _) => + queryRelations.add(catalogTable.identifier.table) + case HiveTableRelation(tableMeta, _, _) => + queryRelations.add(tableMeta.identifier.table) case _ => } val numRows = queryRelations.map(tableSizes.getOrElse(_, 0L)).sum From 02c91e03f975c2a6a05a9d5327057bb6b3c4a66f Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 1 Oct 2017 18:42:45 +0900 Subject: [PATCH 650/779] [SPARK-22063][R] Fixes lint check failures in R by latest commit sha1 ID of lint-r ## What changes were proposed in this pull request? Currently, we set lintr to jimhester/lintra769c0b (see [this](https://github.com/apache/spark/commit/7d1175011c976756efcd4e4e4f70a8fd6f287026) and [SPARK-14074](https://issues.apache.org/jira/browse/SPARK-14074)). I first tested and checked lintr-1.0.1 but it looks many important fixes are missing (for example, checking 100 length). So, I instead tried the latest commit, https://github.com/jimhester/lintr/commit/5431140ffea65071f1327625d4a8de9688fa7e72, in my local and fixed the check failures. It looks it has fixed many bugs and now finds many instances that I have observed and thought should be caught time to time, here I filed [the results](https://gist.github.com/HyukjinKwon/4f59ddcc7b6487a02da81800baca533c). The downside looks it now takes about 7ish mins, (it was 2ish mins before) in my local. ## How was this patch tested? Manually, `./dev/lint-r` after manually updating the lintr package. Author: hyukjinkwon Author: zuotingbing Closes #19290 from HyukjinKwon/upgrade-r-lint. 
--- R/pkg/.lintr | 2 +- R/pkg/R/DataFrame.R | 30 ++-- R/pkg/R/RDD.R | 6 +- R/pkg/R/WindowSpec.R | 2 +- R/pkg/R/column.R | 2 + R/pkg/R/context.R | 2 +- R/pkg/R/deserialize.R | 2 +- R/pkg/R/functions.R | 79 ++++++----- R/pkg/R/generics.R | 4 +- R/pkg/R/group.R | 4 +- R/pkg/R/mllib_classification.R | 137 +++++++++++-------- R/pkg/R/mllib_clustering.R | 15 +- R/pkg/R/mllib_regression.R | 62 +++++---- R/pkg/R/mllib_tree.R | 36 +++-- R/pkg/R/pairRDD.R | 4 +- R/pkg/R/schema.R | 2 +- R/pkg/R/stats.R | 14 +- R/pkg/R/utils.R | 4 +- R/pkg/inst/worker/worker.R | 2 +- R/pkg/tests/fulltests/test_binary_function.R | 2 +- R/pkg/tests/fulltests/test_rdd.R | 6 +- R/pkg/tests/fulltests/test_sparkSQL.R | 14 +- dev/lint-r.R | 4 +- 23 files changed, 242 insertions(+), 193 deletions(-) diff --git a/R/pkg/.lintr b/R/pkg/.lintr index ae50b28ec6166..c83ad2adfe0ef 100644 --- a/R/pkg/.lintr +++ b/R/pkg/.lintr @@ -1,2 +1,2 @@ -linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE)) +linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, object_name_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE)) exclusions: list("inst/profile/general.R" = 1, "inst/profile/shell.R") diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0728141fa483e..176bb3b8a8d0c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1923,13 +1923,15 @@ setMethod("[", signature(x = "SparkDataFrame"), #' @param i,subset (Optional) a logical expression to filter on rows. #' For extract operator [[ and replacement operator [[<-, the indexing parameter for #' a single Column. -#' @param j,select expression for the single Column or a list of columns to select from the SparkDataFrame. +#' @param j,select expression for the single Column or a list of columns to select from the +#' SparkDataFrame. #' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column. #' Otherwise, a SparkDataFrame will always be returned. #' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. #' If \code{NULL}, the specified Column is dropped. #' @param ... currently not used. -#' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns. +#' @return A new SparkDataFrame containing only the rows that meet the condition with selected +#' columns. 
#' @export #' @family SparkDataFrame functions #' @aliases subset,SparkDataFrame-method @@ -2608,12 +2610,12 @@ setMethod("merge", } else { # if by or both by.x and by.y have length 0, use Cartesian Product joinRes <- crossJoin(x, y) - return (joinRes) + return(joinRes) } # sets alias for making colnames unique in dataframes 'x' and 'y' - colsX <- generateAliasesForIntersectedCols(x, by, suffixes[1]) - colsY <- generateAliasesForIntersectedCols(y, by, suffixes[2]) + colsX <- genAliasesForIntersectedCols(x, by, suffixes[1]) + colsY <- genAliasesForIntersectedCols(y, by, suffixes[2]) # selects columns with their aliases from dataframes # in case same column names are present in both data frames @@ -2661,9 +2663,8 @@ setMethod("merge", #' @param intersectedColNames a list of intersected column names of the SparkDataFrame #' @param suffix a suffix for the column name #' @return list of columns -#' -#' @note generateAliasesForIntersectedCols since 1.6.0 -generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { +#' @noRd +genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) { allColNames <- names(x) # sets alias for making colnames unique in dataframe 'x' cols <- lapply(allColNames, function(colName) { @@ -2671,7 +2672,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { if (colName %in% intersectedColNames) { newJoin <- paste(colName, suffix, sep = "") if (newJoin %in% allColNames){ - stop ("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", + stop("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", "Please use different suffixes for the intersected columns.") } col <- alias(col, newJoin) @@ -3058,7 +3059,8 @@ setMethod("describe", #' summary(select(df, "age", "height")) #' } #' @note summary(SparkDataFrame) since 1.5.0 -#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults. +#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for +#' previous defaults. #' @seealso \link{describe} setMethod("summary", signature(object = "SparkDataFrame"), @@ -3765,8 +3767,8 @@ setMethod("checkpoint", #' #' Create a multi-dimensional cube for the SparkDataFrame using the specified columns. #' -#' If grouping expression is missing \code{cube} creates a single global aggregate and is equivalent to -#' direct application of \link{agg}. +#' If grouping expression is missing \code{cube} creates a single global aggregate and is +#' equivalent to direct application of \link{agg}. #' #' @param x a SparkDataFrame. #' @param ... character name(s) or Column(s) to group on. @@ -3800,8 +3802,8 @@ setMethod("cube", #' #' Create a multi-dimensional rollup for the SparkDataFrame using the specified columns. #' -#' If grouping expression is missing \code{rollup} creates a single global aggregate and is equivalent to -#' direct application of \link{agg}. +#' If grouping expression is missing \code{rollup} creates a single global aggregate and is +#' equivalent to direct application of \link{agg}. #' #' @param x a SparkDataFrame. #' @param ... character name(s) or Column(s) to group on. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 15ca212acf87f..6e89b4bb4d964 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -131,7 +131,7 @@ PipelinedRDD <- function(prev, func) { # Return the serialization mode for an RDD. setGeneric("getSerializedMode", function(rdd, ...) 
{ standardGeneric("getSerializedMode") }) # For normal RDDs we can directly read the serializedMode -setMethod("getSerializedMode", signature(rdd = "RDD"), function(rdd) rdd@env$serializedMode ) +setMethod("getSerializedMode", signature(rdd = "RDD"), function(rdd) rdd@env$serializedMode) # For pipelined RDDs if jrdd_val is set then serializedMode should exist # if not we return the defaultSerialization mode of "byte" as we don't know the serialization # mode at this point in time. @@ -145,7 +145,7 @@ setMethod("getSerializedMode", signature(rdd = "PipelinedRDD"), }) # The jrdd accessor function. -setMethod("getJRDD", signature(rdd = "RDD"), function(rdd) rdd@jrdd ) +setMethod("getJRDD", signature(rdd = "RDD"), function(rdd) rdd@jrdd) setMethod("getJRDD", signature(rdd = "PipelinedRDD"), function(rdd, serializedMode = "byte") { if (!is.null(rdd@env$jrdd_val)) { @@ -893,7 +893,7 @@ setMethod("sampleRDD", if (withReplacement) { count <- stats::rpois(1, fraction) if (count > 0) { - res[ (len + 1) : (len + count) ] <- rep(list(elem), count) + res[(len + 1) : (len + count)] <- rep(list(elem), count) len <- len + count } } else { diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 81beac9ea9925..debc7cbde55e7 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -73,7 +73,7 @@ setMethod("show", "WindowSpec", setMethod("partitionBy", signature(x = "WindowSpec"), function(x, col, ...) { - stopifnot (class(col) %in% c("character", "Column")) + stopifnot(class(col) %in% c("character", "Column")) if (class(col) == "character") { windowSpec(callJMethod(x@sws, "partitionBy", col, list(...))) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index a5c2ea81f2490..3095adb918b67 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -238,8 +238,10 @@ setMethod("between", signature(x = "Column"), #' @param x a Column. #' @param dataType a character object describing the target data type. #' See +# nolint start #' \href{https://spark.apache.org/docs/latest/sparkr.html#data-type-mapping-between-r-and-spark}{ #' Spark Data Types} for available data types. +# nolint end #' @rdname cast #' @name cast #' @family colum_func diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 8349b57a30a93..443c2ff8f9ace 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -329,7 +329,7 @@ spark.addFile <- function(path, recursive = FALSE) { #' spark.getSparkFilesRootDirectory() #'} #' @note spark.getSparkFilesRootDirectory since 2.1.0 -spark.getSparkFilesRootDirectory <- function() { +spark.getSparkFilesRootDirectory <- function() { # nolint if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") { # Running on driver. callJStatic("org.apache.spark.SparkFiles", "getRootDirectory") diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 0e99b171cabeb..a90f7d381026b 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -43,7 +43,7 @@ readObject <- function(con) { } readTypedObject <- function(con, type) { - switch (type, + switch(type, "i" = readInt(con), "c" = readString(con), "b" = readBoolean(con), diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 9f286263c2162..0143a3e63ba61 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -38,7 +38,8 @@ NULL #' #' Date time functions defined for \code{Column}. #' -#' @param x Column to compute on. In \code{window}, it must be a time Column of \code{TimestampType}. +#' @param x Column to compute on. In \code{window}, it must be a time Column of +#' \code{TimestampType}. 
#' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse #' Column \code{x} to DateType or TimestampType. For \code{trunc}, it is the string #' to use to specify the truncation method. For example, "year", "yyyy", "yy" for @@ -90,8 +91,8 @@ NULL #' #' Math functions defined for \code{Column}. #' -#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and \code{shiftRightUnsigned}, -#' this is the number of bits to shift. +#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and +#' \code{shiftRightUnsigned}, this is the number of bits to shift. #' @param y Column to compute on. #' @param ... additional argument(s). #' @name column_math_functions @@ -480,7 +481,7 @@ setMethod("ceiling", setMethod("coalesce", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -676,7 +677,7 @@ setMethod("crc32", setMethod("hash", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -1310,9 +1311,9 @@ setMethod("round", #' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number. #' bround(2.5, 0) = 2, bround(3.5, 0) = 4. #' -#' @param scale round to \code{scale} digits to the right of the decimal point when \code{scale} > 0, -#' the nearest even number when \code{scale} = 0, and \code{scale} digits to the left -#' of the decimal point when \code{scale} < 0. +#' @param scale round to \code{scale} digits to the right of the decimal point when +#' \code{scale} > 0, the nearest even number when \code{scale} = 0, and \code{scale} digits +#' to the left of the decimal point when \code{scale} < 0. #' @rdname column_math_functions #' @aliases bround bround,Column-method #' @export @@ -2005,8 +2006,9 @@ setMethod("months_between", signature(y = "Column"), }) #' @details -#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column (\code{x}) if -#' the first column is NaN. Both inputs should be floating point columns (DoubleType or FloatType). +#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column +#' (\code{x}) if the first column is NaN. Both inputs should be floating point columns +#' (DoubleType or FloatType). #' #' @rdname column_nonaggregate_functions #' @aliases nanvl nanvl,Column-method @@ -2061,7 +2063,7 @@ setMethod("approxCountDistinct", setMethod("countDistinct", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(...), function (x) { + jcols <- lapply(list(...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2090,7 +2092,7 @@ setMethod("countDistinct", setMethod("concat", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2110,7 +2112,7 @@ setMethod("greatest", signature(x = "Column"), function(x, ...) { stopifnot(length(list(...)) > 0) - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2130,7 +2132,7 @@ setMethod("least", signature(x = "Column"), function(x, ...) 
{ stopifnot(length(list(...)) > 0) - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2406,8 +2408,8 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long value, -#' it will return a long value else it will return an integer value. +#' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long +#' value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions #' @aliases shiftRight shiftRight,Column,numeric-method @@ -2505,9 +2507,10 @@ setMethod("format_string", signature(format = "character", x = "Column"), }) #' @details -#' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a -#' string representing the timestamp of that moment in the current system time zone in the JVM in the -#' given format. See \href{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}{ +#' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) +#' to a string representing the timestamp of that moment in the current system time zone in the JVM +#' in the given format. +#' See \href{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}{ #' Customizing Formats} for available options. #' #' @rdname column_datetime_functions @@ -2634,8 +2637,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), }) #' @details -#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) samples -#' from U[0.0, 1.0]. +#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) +#' samples from U[0.0, 1.0]. #' #' @rdname column_nonaggregate_functions #' @param seed a random seed. Can be missing. @@ -2664,8 +2667,8 @@ setMethod("rand", signature(seed = "numeric"), }) #' @details -#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples from -#' the standard normal distribution. +#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples +#' from the standard normal distribution. #' #' @rdname column_nonaggregate_functions #' @aliases randn randn,missing-method @@ -2831,8 +2834,8 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), }) #' @details -#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result expressions. -#' For unmatched expressions null is returned. +#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result +#' expressions. For unmatched expressions null is returned. #' #' @rdname column_nonaggregate_functions #' @param condition the condition to test on. Must be a Column expression. @@ -2859,8 +2862,8 @@ setMethod("when", signature(condition = "Column", value = "ANY"), }) #' @details -#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. -#' Otherwise \code{no} is returned for unmatched conditions. +#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are +#' satisfied. Otherwise \code{no} is returned for unmatched conditions. #' #' @rdname column_nonaggregate_functions #' @param test a Column expression that describes the condition. 
@@ -2990,7 +2993,8 @@ setMethod("ntile", }) #' @details -#' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window partition. +#' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window +#' partition. #' This is computed by: (rank of row in its partition - 1) / (number of rows in the partition - 1). #' This is equivalent to the \code{PERCENT_RANK} function in SQL. #' The method should be used with no argument. @@ -3160,7 +3164,8 @@ setMethod("posexplode", }) #' @details -#' \code{create_array}: Creates a new array column. The input columns must all have the same data type. +#' \code{create_array}: Creates a new array column. The input columns must all have the same data +#' type. #' #' @rdname column_nonaggregate_functions #' @aliases create_array create_array,Column-method @@ -3169,7 +3174,7 @@ setMethod("posexplode", setMethod("create_array", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -3178,8 +3183,8 @@ setMethod("create_array", }) #' @details -#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value pairs, -#' e.g. (key1, value1, key2, value2, ...). +#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value +#' pairs, e.g. (key1, value1, key2, value2, ...). #' The key columns must all have the same data type, and can't be null. #' The value columns must all have the same data type. #' @@ -3190,7 +3195,7 @@ setMethod("create_array", setMethod("create_map", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -3352,9 +3357,9 @@ setMethod("not", }) #' @details -#' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or not, -#' returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} in SQL -#' and \code{grouping} function in Scala. +#' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or +#' not, returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} +#' in SQL and \code{grouping} function in Scala. #' #' @rdname column_aggregate_functions #' @aliases grouping_bit grouping_bit,Column-method @@ -3412,7 +3417,7 @@ setMethod("grouping_bit", setMethod("grouping_id", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0fe8f0453b064..4e427489f6860 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -385,7 +385,7 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) #' @return A SparkDataFrame. #' @rdname summarize #' @export -setGeneric("agg", function (x, ...) { standardGeneric("agg") }) +setGeneric("agg", function(x, ...) { standardGeneric("agg") }) #' alias #' @@ -731,7 +731,7 @@ setGeneric("schema", function(x) { standardGeneric("schema") }) #' @rdname select #' @export -setGeneric("select", function(x, col, ...) { standardGeneric("select") } ) +setGeneric("select", function(x, col, ...) 
{ standardGeneric("select") }) #' @rdname selectExpr #' @export diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 0a7be0e993975..54ef9f07d6fae 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -133,8 +133,8 @@ setMethod("summarize", # Aggregate Functions by name methods <- c("avg", "max", "mean", "min", "sum") -# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", "stddev_pop", -# "variance", "var_samp", "var_pop" +# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", +# "stddev_pop", "variance", "var_samp", "var_pop" #' Pivot a column of the GroupedData and perform the specified aggregation. #' diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 15af8298ba484..7cd072a1d6f89 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -58,22 +58,25 @@ setClass("NaiveBayesModel", representation(jobj = "jobj")) #' @param regParam The regularization parameter. Only supports L2 regularization currently. #' @param maxIter Maximum iteration number. #' @param tol Convergence tolerance of iterations. -#' @param standardization Whether to standardize the training features before fitting the model. The coefficients -#' of models will be always returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should be always converged -#' to the same solution when no regularization is applied. +#' @param standardization Whether to standardize the training features before fitting the model. +#' The coefficients of models will be always returned on the original scale, +#' so it will be transparent for users. Note that with/without +#' standardization, the models should be always converged to the same +#' solution when no regularization is applied. #' @param threshold The threshold in binary classification applied to the linear model prediction. #' This threshold can be any real number, where Inf will make all predictions 0.0 #' and -Inf will make all predictions 1.0. #' @param weightCol The weight column name. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this param +#' could be adjusted to a larger size. #' This is an expert parameter. Default value should be good for most cases. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.svmLinear} returns a fitted linear SVM model. #' @rdname spark.svmLinear @@ -175,62 +178,80 @@ function(object, path, overwrite = FALSE) { #' Logistic Regression Model #' -#' Fits an logistic regression model against a SparkDataFrame. 
It supports "binomial": Binary logistic regression -#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. -#' Users can print, make predictions on the produced model and save the model to the input path. +#' Fits an logistic regression model against a SparkDataFrame. It supports "binomial": Binary +#' logistic regression with pivoting; "multinomial": Multinomial logistic (softmax) regression +#' without pivoting, similar to glmnet. Users can print, make predictions on the produced model +#' and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. #' @param regParam the regularization parameter. -#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. -#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination -#' of L1 and L2. Default is 0.0 which is an L2 penalty. +#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 +#' penalty. For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, +#' the penalty is a combination of L1 and L2. Default is 0.0 which is an +#' L2 penalty. #' @param maxIter maximum iteration number. #' @param tol convergence tolerance of iterations. -#' @param family the name of family which is a description of the label distribution to be used in the model. +#' @param family the name of family which is a description of the label distribution to be used +#' in the model. #' Supported options: #' \itemize{ #' \item{"auto": Automatically select the family based on the number of classes: #' If number of classes == 1 || number of classes == 2, set to "binomial". #' Else, set to "multinomial".} #' \item{"binomial": Binary logistic regression with pivoting.} -#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} +#' \item{"multinomial": Multinomial logistic (softmax) regression without +#' pivoting.} #' } -#' @param standardization whether to standardize the training features before fitting the model. The coefficients -#' of models will be always returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should be always converged -#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet. -#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 -#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 -#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with -#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of -#' predicting each class. Array must have length equal to the number of classes, with values > 0, -#' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p -#' is the original probability of that class and t is the class's threshold. +#' @param standardization whether to standardize the training features before fitting the model. +#' The coefficients of models will be always returned on the original scale, +#' so it will be transparent for users. 
Note that with/without +#' standardization, the models should be always converged to the same +#' solution when no regularization is applied. Default is TRUE, same as +#' glmnet. +#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of +#' class label 1 is > threshold, then predict 1, else 0. A high threshold +#' encourages the model to predict 0 more often; a low threshold encourages the +#' model to predict 1 more often. Note: Setting this with threshold p is +#' equivalent to setting thresholds c(1-p, p). In multiclass (or binary) +#' classification to adjust the probability of predicting each class. Array must +#' have length equal to the number of classes, with values > 0, excepting that +#' at most one value may be 0. The class with largest value p/t is predicted, +#' where p is the original probability of that class and t is the class's +#' threshold. #' @param weightCol The weight column name. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. -#' This is an expert parameter. Default value should be good for most cases. -#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization. -#' The bound matrix must be compatible with the shape (1, number of features) for binomial -#' regression, or (number of classes, number of features) for multinomial regression. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this param +#' could be adjusted to a larger size. This is an expert parameter. Default +#' value should be good for most cases. +#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound +#' constrained optimization. +#' The bound matrix must be compatible with the shape (1, number +#' of features) for binomial regression, or (number of classes, +#' number of features) for multinomial regression. #' It is a R matrix. -#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization. -#' The bound matrix must be compatible with the shape (1, number of features) for binomial -#' regression, or (number of classes, number of features) for multinomial regression. +#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound +#' constrained optimization. +#' The bound matrix must be compatible with the shape (1, number +#' of features) for binomial regression, or (number of classes, +#' number of features) for multinomial regression. #' It is a R matrix. -#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization. -#' The bounds vector size must be equal to 1 for binomial regression, or the number -#' of classes for multinomial regression. -#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization. -#' The bound vector size must be equal to 1 for binomial regression, or the number +#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained +#' optimization. +#' The bounds vector size must be equal to 1 for binomial regression, +#' or the number #' of classes for multinomial regression. 
-#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained +#' optimization. +#' The bound vector size must be equal to 1 for binomial regression, +#' or the number of classes for multinomial regression. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model. #' @rdname spark.logit @@ -412,11 +433,12 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char #' @param seed seed parameter for weights initialization. #' @param initialWeights initialWeights parameter for weights initialization, it should be a #' numeric vector. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. #' @rdname spark.mlp @@ -452,11 +474,11 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"), handleInvalid = c("error", "keep", "skip")) { formula <- paste(deparse(formula), collapse = "") if (is.null(layers)) { - stop ("layers must be a integer vector with length > 1.") + stop("layers must be a integer vector with length > 1.") } layers <- as.integer(na.omit(layers)) if (length(layers) <= 1) { - stop ("layers must be a integer vector with length > 1.") + stop("layers must be a integer vector with length > 1.") } if (!is.null(seed)) { seed <- as.character(as.integer(seed)) @@ -538,11 +560,12 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode #' @param formula a symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. #' @param smoothing smoothing parameter. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... 
additional argument(s) passed to the method. Currently only \code{smoothing}. #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. #' @rdname spark.naiveBayes diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index 97c9fa1b45840..a25bf81c6d977 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -60,9 +60,9 @@ setClass("LDAModel", representation(jobj = "jobj")) #' @param maxIter maximum iteration number. #' @param seed the random seed. #' @param minDivisibleClusterSize The minimum number of points (if greater than or equal to 1.0) -#' or the minimum proportion of points (if less than 1.0) of a divisible cluster. -#' Note that it is an expert parameter. The default value should be good enough -#' for most cases. +#' or the minimum proportion of points (if less than 1.0) of a +#' divisible cluster. Note that it is an expert parameter. The +#' default value should be good enough for most cases. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.bisectingKmeans} returns a fitted bisecting k-means model. #' @rdname spark.bisectingKmeans @@ -325,10 +325,11 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact #' Note that the response variable of formula is empty in spark.kmeans. #' @param k number of centers. #' @param maxIter maximum iteration number. -#' @param initMode the initialization algorithm choosen to fit the model. +#' @param initMode the initialization algorithm chosen to fit the model. #' @param seed the random seed for cluster initialization. #' @param initSteps the number of steps for the k-means|| initialization mode. -#' This is an advanced setting, the default of 2 is almost always enough. Must be > 0. +#' This is an advanced setting, the default of 2 is almost always enough. +#' Must be > 0. #' @param tol convergence tolerance of iterations. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.kmeans} returns a fitted k-means model. @@ -548,8 +549,8 @@ setMethod("spark.lda", signature(data = "SparkDataFrame"), #' \item{\code{topics}}{top 10 terms and their weights of all topics} #' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file #' used as training set} -#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the training set, -#' given the current parameter estimates: +#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the +#' training set, given the current parameter estimates: #' log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters) #' It is only for distributed LDA model (i.e., optimizer = "em")} #' \item{\code{logPrior}}{Log probability of the current parameter estimate: diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index ebaeae970218a..f734a0865ec3b 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -58,8 +58,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' Note that there are two ways to specify the tweedie family. #' \itemize{ #' \item Set \code{family = "tweedie"} and specify the var.power and link.power; -#' \item When package \code{statmod} is loaded, the tweedie family is specified using the -#' family definition therein, i.e., \code{tweedie(var.power, link.power)}. +#' \item When package \code{statmod} is loaded, the tweedie family is specified +#' using the family definition therein, i.e., \code{tweedie(var.power, link.power)}. 
#' } #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. @@ -71,13 +71,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' applicable to the Tweedie family. #' @param link.power the index in the power link function. Only applicable to the Tweedie family. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. -#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets -#' as 0.0. The feature specified as offset has a constant coefficient of 1.0. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. +#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance +#' offsets as 0.0. The feature specified as offset has a constant coefficient of +#' 1.0. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -197,13 +199,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @param var.power the index of the power variance function in the Tweedie family. #' @param link.power the index of the power link function in the Tweedie family. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. -#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets -#' as 0.0. The feature specified as offset has a constant coefficient of 1.0. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. +#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance +#' offsets as 0.0. The feature specified as offset has a constant coefficient of +#' 1.0. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export @@ -233,11 +237,11 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat #' @param object a fitted generalized linear model. #' @return \code{summary} returns summary information of the fitted model, which is a list. 
-#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes -#' coefficients, standard error of coefficients, t value and p value), +#' The list of components includes at least the \code{coefficients} (coefficients matrix, +#' which includes coefficients, standard error of coefficients, t value and p value), #' \code{null.deviance} (null/residual degrees of freedom), \code{aic} (AIC) -#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, -#' the coefficients matrix only provides coefficients. +#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in +#' the data, the coefficients matrix only provides coefficients. #' @rdname spark.glm #' @export #' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 @@ -457,15 +461,17 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char #' @param formula a symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', ':', '+', and '-'. #' Note that operator '.' is not supported currently. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. -#' This is an expert parameter. Default value should be good for most cases. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this +#' param could be adjusted to a larger size. This is an expert parameter. +#' Default value should be good for most cases. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. #' @param ... additional arguments passed to the method. #' @return \code{spark.survreg} returns a fitted AFT survival regression model. #' @rdname spark.survreg diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 33c4653f4c184..89a58bf0aadae 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -132,10 +132,12 @@ print.summary.decisionTree <- function(x) { #' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and #' \code{write.ml}/\code{read.ml} to save/load fitted models. #' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ #' GBT Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ #' GBT Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. 
Currently only a few formula @@ -164,11 +166,12 @@ print.summary.decisionTree <- function(x) { #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.gbt,SparkDataFrame,formula-method #' @return \code{spark.gbt} returns a fitted Gradient Boosted Tree model. @@ -352,10 +355,12 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to #' save/load fitted models. #' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{ #' Random Forest Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{ #' Random Forest Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula @@ -382,11 +387,12 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. @@ -567,10 +573,12 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to #' save/load fitted models. 
#' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-regression}{ #' Decision Tree Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier}{ #' Decision Tree Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula @@ -592,11 +600,12 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.decisionTree,SparkDataFrame,formula-method #' @return \code{spark.decisionTree} returns a fitted Decision Tree model. @@ -671,7 +680,8 @@ setMethod("spark.decisionTree", signature(data = "SparkDataFrame", formula = "fo #' @return \code{summary} returns summary information of the fitted model, which is a list. #' The list of components includes \code{formula} (formula), #' \code{numFeatures} (number of features), \code{features} (list of features), -#' \code{featureImportances} (feature importances), and \code{maxDepth} (max depth of trees). +#' \code{featureImportances} (feature importances), and \code{maxDepth} (max depth of +#' trees). #' @rdname spark.decisionTree #' @aliases summary,DecisionTreeRegressionModel-method #' @export diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 8fa21be3076b5..9c2e57d3067db 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -860,7 +860,7 @@ setMethod("subtractByKey", other, numPartitions = numPartitions), filterFunction), - function (v) { v[[1]] }) + function(v) { v[[1]] }) }) #' Return a subset of this RDD sampled by key. @@ -925,7 +925,7 @@ setMethod("sampleByKey", if (withReplacement) { count <- stats::rpois(1, frac) if (count > 0) { - res[ (len + 1) : (len + count) ] <- rep(list(elem), count) + res[(len + 1) : (len + count)] <- rep(list(elem), count) len <- len + count } } else { diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index d1ed6833d5d02..65f418740c643 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -155,7 +155,7 @@ checkType <- function(type) { } else { # Check complex types firstChar <- substr(type, 1, 1) - switch (firstChar, + switch(firstChar, a = { # Array type m <- regexec("^array<(.+)>$", type) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 9a9fa84044ce6..c8af798830b30 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -29,9 +29,9 @@ setOldClass("jobj") #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. 
Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row -#' will be the distinct values of \code{col1} and the column names will be the distinct values -#' of \code{col2}. The name of the first column will be "\code{col1}_\code{col2}". Pairs -#' that have no occurrences will have zero as their counts. +#' will be the distinct values of \code{col1} and the column names will be the distinct +#' values of \code{col2}. The name of the first column will be "\code{col1}_\code{col2}". +#' Pairs that have no occurrences will have zero as their counts. #' #' @rdname crosstab #' @name crosstab @@ -53,8 +53,8 @@ setMethod("crosstab", }) #' @details -#' \code{cov}: When applied to SparkDataFrame, this calculates the sample covariance of two numerical -#' columns of \emph{one} SparkDataFrame. +#' \code{cov}: When applied to SparkDataFrame, this calculates the sample covariance of two +#' numerical columns of \emph{one} SparkDataFrame. #' #' @param colName1 the name of the first column #' @param colName2 the name of the second column @@ -159,8 +159,8 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' @param relativeError The relative target precision to achieve (>= 0). If set to zero, #' the exact quantiles are computed, which could be very expensive. #' Note that values greater than 1 are accepted but give the same result as 1. -#' @return The approximate quantiles at the given probabilities. If the input is a single column name, -#' the output is a list of approximate quantiles in that column; If the input is +#' @return The approximate quantiles at the given probabilities. If the input is a single column +#' name, the output is a list of approximate quantiles in that column; If the input is #' multiple column names, the output should be a list, and each element in it is a list of #' numeric values which represents the approximate quantiles in corresponding column. 
#' diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 91483a4d23d9b..4b716995f2c46 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -625,7 +625,7 @@ appendPartitionLengths <- function(x, other) { x <- lapplyPartition(x, appendLength) other <- lapplyPartition(other, appendLength) } - list (x, other) + list(x, other) } # Perform zip or cartesian between elements from two RDDs in each partition @@ -657,7 +657,7 @@ mergePartitions <- function(rdd, zip) { keys <- list() } if (lengthOfValues > 1) { - values <- part[ (lengthOfKeys + 1) : (len - 1) ] + values <- part[(lengthOfKeys + 1) : (len - 1)] } else { values <- list() } diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 03e7450147865..00789d815bba8 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -68,7 +68,7 @@ compute <- function(mode, partition, serializer, deserializer, key, } else { output <- computeFunc(partition, inputData) } - return (output) + return(output) } outputResult <- function(serializer, output, outputCon) { diff --git a/R/pkg/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R index 442bed509bb1d..c5d240f3e7344 100644 --- a/R/pkg/tests/fulltests/test_binary_function.R +++ b/R/pkg/tests/fulltests/test_binary_function.R @@ -73,7 +73,7 @@ test_that("zipPartitions() on RDDs", { rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4 rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6 actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3, - func = function(x, y, z) { list(list(x, y, z))} )) + func = function(x, y, z) { list(list(x, y, z))})) expect_equal(actual, list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6)))) diff --git a/R/pkg/tests/fulltests/test_rdd.R b/R/pkg/tests/fulltests/test_rdd.R index 6ee1fceffd822..0c702ea897f7c 100644 --- a/R/pkg/tests/fulltests/test_rdd.R +++ b/R/pkg/tests/fulltests/test_rdd.R @@ -698,14 +698,14 @@ test_that("fullOuterJoin() on pairwise RDDs", { }) test_that("sortByKey() on pairwise RDDs", { - numPairsRdd <- map(rdd, function(x) { list (x, x) }) + numPairsRdd <- map(rdd, function(x) { list(x, x) }) sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE) actual <- collectRDD(sortedRdd) - numPairs <- lapply(nums, function(x) { list (x, x) }) + numPairs <- lapply(nums, function(x) { list(x, x) }) expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE)) rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L) - numPairsRdd2 <- map(rdd2, function(x) { list (x, x) }) + numPairsRdd2 <- map(rdd2, function(x) { list(x, x) }) sortedRdd2 <- sortByKey(numPairsRdd2) actual <- collectRDD(sortedRdd2) expect_equal(actual, numPairs) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 4e62be9b4d619..7f781f2f66a7f 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -560,9 +560,9 @@ test_that("Collect DataFrame with complex types", { expect_equal(nrow(ldf), 3) expect_equal(ncol(ldf), 3) expect_equal(names(ldf), c("c1", "c2", "c3")) - expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list (7, 8, 9))) - expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list ("g", "h", "i"))) - expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list (7.0, 8.0, 9.0))) + expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))) + expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list("g", "h", "i"))) + expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list(7.0, 
8.0, 9.0))) # MapType schema <- structType(structField("name", "string"), @@ -1524,7 +1524,7 @@ test_that("column functions", { expect_equal(ncol(s), 1) expect_equal(nrow(s), 3) expect_is(s[[1]][[1]], "struct") - expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } ))) + expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 }))) } # passing option @@ -2710,7 +2710,7 @@ test_that("freqItems() on a DataFrame", { input <- 1:1000 rdf <- data.frame(numbers = input, letters = as.character(input), negDoubles = input * -1.0, stringsAsFactors = F) - rdf[ input %% 3 == 0, ] <- c(1, "1", -1) + rdf[input %% 3 == 0, ] <- c(1, "1", -1) df <- createDataFrame(rdf) multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1) expect_true(1 %in% multiColResults$numbers[[1]]) @@ -3064,7 +3064,7 @@ test_that("coalesce, repartition, numPartitions", { }) test_that("gapply() and gapplyCollect() on a DataFrame", { - df <- createDataFrame ( + df <- createDataFrame( list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)), c("a", "b", "c", "d")) expected <- collect(df) @@ -3135,7 +3135,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { actual <- df3Collect[order(df3Collect$a), ] expect_identical(actual$avg, expected$avg) - irisDF <- suppressWarnings(createDataFrame (iris)) + irisDF <- suppressWarnings(createDataFrame(iris)) schema <- structType(structField("Sepal_Length", "double"), structField("Avg", "double")) # Groups by `Sepal_Length` and computes the average for `Sepal_Width` df4 <- gapply( diff --git a/dev/lint-r.R b/dev/lint-r.R index 87ee36d5c9b68..a4261d266bbc0 100644 --- a/dev/lint-r.R +++ b/dev/lint-r.R @@ -26,8 +26,8 @@ if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { # Installs lintr from Github in a local directory. # NOTE: The CRAN's version is too old to adapt to our rules. -if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr@a769c0b") +if ("lintr" %in% row.names(installed.packages()) == FALSE) { + devtools::install_github("jimhester/lintr@5431140") } library(lintr) From 3ca367083e196e6487207211e6c49d4bbfe31288 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 1 Oct 2017 10:49:22 -0700 Subject: [PATCH 651/779] [SPARK-22001][ML][SQL] ImputerModel can do withColumn for all input columns at one pass ## What changes were proposed in this pull request? SPARK-21690 makes one-pass `Imputer` by parallelizing the computation of all input columns. When we transform dataset with `ImputerModel`, we do `withColumn` on all input columns sequentially. We can also do this on all input columns at once by adding a `withColumns` API to `Dataset`. The new `withColumns` API is for internal use only now. ## How was this patch tested? Existing tests for `ImputerModel`'s change. Added tests for `withColumns` API. Author: Liang-Chi Hsieh Closes #19229 from viirya/SPARK-22001. 
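A minimal sketch of the intended usage (illustrative only, not part of the patch; it assumes a `SparkSession` named `spark`, and because `withColumns` is `private[spark]` it can only be called from Spark-internal code):

```scala
import org.apache.spark.sql.functions.col

val df = spark.range(3).toDF("key")

// Old approach: each withColumn call layers another projection on the plan.
val sequential = df
  .withColumn("plusOne", col("key") + 1)
  .withColumn("plusTwo", col("key") + 2)

// New internal API: both columns are added in a single select.
val onePass = df.withColumns(
  Seq("plusOne", "plusTwo"),
  Seq(col("key") + 1, col("key") + 2))
```

This is the shape of the `ImputerModel.transform` change below, which replaces the sequential chain of `withColumn` calls over `$(outputCols)` with one `withColumns` call.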
--- .../org/apache/spark/ml/feature/Imputer.scala | 10 ++-- .../scala/org/apache/spark/sql/Dataset.scala | 42 ++++++++++----- .../org/apache/spark/sql/DataFrameSuite.scala | 52 +++++++++++++++++++ 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 1f36eced3d08f..4663f16b5f5dc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -223,20 +223,18 @@ class ImputerModel private[ml] ( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - var outputDF = dataset val surrogates = surrogateDF.select($(inputCols).map(col): _*).head().toSeq - $(inputCols).zip($(outputCols)).zip(surrogates).foreach { + val newCols = $(inputCols).zip($(outputCols)).zip(surrogates).map { case ((inputCol, outputCol), surrogate) => val inputType = dataset.schema(inputCol).dataType val ic = col(inputCol) - outputDF = outputDF.withColumn(outputCol, - when(ic.isNull, surrogate) + when(ic.isNull, surrogate) .when(ic === $(missingValue), surrogate) .otherwise(ic) - .cast(inputType)) + .cast(inputType) } - outputDF.toDF() + dataset.withColumns($(outputCols), newCols).toDF() } override def transformSchema(schema: StructType): StructType = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index ab0c4126bcbdd..f2a76a506eb6f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2083,22 +2083,40 @@ class Dataset[T] private[sql]( * @group untypedrel * @since 2.0.0 */ - def withColumn(colName: String, col: Column): DataFrame = { + def withColumn(colName: String, col: Column): DataFrame = withColumns(Seq(colName), Seq(col)) + + /** + * Returns a new Dataset by adding columns or replacing the existing columns that has + * the same names. 
+ */ + private[spark] def withColumns(colNames: Seq[String], cols: Seq[Column]): DataFrame = { + require(colNames.size == cols.size, + s"The size of column names: ${colNames.size} isn't equal to " + + s"the size of columns: ${cols.size}") + SchemaUtils.checkColumnNameDuplication( + colNames, + "in given column names", + sparkSession.sessionState.conf.caseSensitiveAnalysis) + val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output - val shouldReplace = output.exists(f => resolver(f.name, colName)) - if (shouldReplace) { - val columns = output.map { field => - if (resolver(field.name, colName)) { - col.as(colName) - } else { - Column(field) - } + + val columnMap = colNames.zip(cols).toMap + + val replacedAndExistingColumns = output.map { field => + columnMap.find { case (colName, _) => + resolver(field.name, colName) + } match { + case Some((colName: String, col: Column)) => col.as(colName) + case _ => Column(field) } - select(columns : _*) - } else { - select(Column("*"), col.as(colName)) } + + val newColumns = columnMap.filter { case (colName, col) => + !output.exists(f => resolver(f.name, colName)) + }.map { case (colName, col) => col.as(colName) } + + select(replacedAndExistingColumns ++ newColumns : _*) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 0e2f2e5a193e1..672deeac597f1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -641,6 +641,49 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(df.schema.map(_.name) === Seq("key", "value", "newCol")) } + test("withColumns") { + val df = testData.toDF().withColumns(Seq("newCol1", "newCol2"), + Seq(col("key") + 1, col("key") + 2)) + checkAnswer( + df, + testData.collect().map { case Row(key: Int, value: String) => + Row(key, value, key + 1, key + 2) + }.toSeq) + assert(df.schema.map(_.name) === Seq("key", "value", "newCol1", "newCol2")) + + val err = intercept[IllegalArgumentException] { + testData.toDF().withColumns(Seq("newCol1"), + Seq(col("key") + 1, col("key") + 2)) + } + assert( + err.getMessage.contains("The size of column names: 1 isn't equal to the size of columns: 2")) + + val err2 = intercept[AnalysisException] { + testData.toDF().withColumns(Seq("newCol1", "newCOL1"), + Seq(col("key") + 1, col("key") + 2)) + } + assert(err2.getMessage.contains("Found duplicate column(s)")) + } + + test("withColumns: case sensitive") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val df = testData.toDF().withColumns(Seq("newCol1", "newCOL1"), + Seq(col("key") + 1, col("key") + 2)) + checkAnswer( + df, + testData.collect().map { case Row(key: Int, value: String) => + Row(key, value, key + 1, key + 2) + }.toSeq) + assert(df.schema.map(_.name) === Seq("key", "value", "newCol1", "newCOL1")) + + val err = intercept[AnalysisException] { + testData.toDF().withColumns(Seq("newCol1", "newCol1"), + Seq(col("key") + 1, col("key") + 2)) + } + assert(err.getMessage.contains("Found duplicate column(s)")) + } + } + test("replace column using withColumn") { val df2 = sparkContext.parallelize(Array(1, 2, 3)).toDF("x") val df3 = df2.withColumn("x", df2("x") + 1) @@ -649,6 +692,15 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Row(2) :: Row(3) :: Row(4) :: Nil) } + test("replace column using withColumns") { + val df2 = sparkContext.parallelize(Array((1, 2), 
(2, 3), (3, 4))).toDF("x", "y") + val df3 = df2.withColumns(Seq("x", "newCol1", "newCol2"), + Seq(df2("x") + 1, df2("y"), df2("y") + 1)) + checkAnswer( + df3.select("x", "newCol1", "newCol2"), + Row(2, 2, 3) :: Row(3, 3, 4) :: Row(4, 4, 5) :: Nil) + } + test("drop column using drop") { val df = testData.drop("key") checkAnswer( From 405c0e99e7697bfa88aa4abc9a55ce5e043e48b1 Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Mon, 2 Oct 2017 08:07:56 +0100 Subject: [PATCH 652/779] [SPARK-22173][WEB-UI] Table CSS style needs to be adjusted in History Page and in Executors Page. ## What changes were proposed in this pull request? There is a problem with table CSS style. 1. At present, table CSS style is too crowded, and the table width cannot adapt itself. 2. Table CSS style is different from job page, stage page, task page, master page, worker page, etc. The Spark web UI needs to be consistent. fix before: ![01](https://user-images.githubusercontent.com/26266482/31041261-c6766c3a-a5c4-11e7-97a7-96bd51ef12bd.png) ![02](https://user-images.githubusercontent.com/26266482/31041266-d75b6a32-a5c4-11e7-8071-e3bbbba39b80.png) ---------------------------------------------------------------------------------------------------------- fix after: ![1](https://user-images.githubusercontent.com/26266482/31041162-808a5a3e-a5c3-11e7-8d92-d763b500ce53.png) ![2](https://user-images.githubusercontent.com/26266482/31041166-86e583e0-a5c3-11e7-949c-11c370db9e27.png) ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Closes #19397 from guoxiaolongzte/SPARK-22173. --- .../scala/org/apache/spark/deploy/history/HistoryPage.scala | 4 ++-- .../main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index af14717633409..6399dccc1676a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -37,7 +37,7 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") val content =
-
+
    {providerConfig.map { case (k, v) =>
  • {k}: {v}
  • }}
@@ -58,7 +58,7 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { if (allAppsSize > 0) { ++ - ++ +
++ ++ ++ diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index d63381c78bc3b..7b2767f0be3cd 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -82,7 +82,7 @@ private[ui] class ExecutorsPage(
++ - ++ +
++ ++ ++ From 8fab7995d36c7bc4524393b20a4e524dbf6bbf62 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 2 Oct 2017 11:46:51 -0700 Subject: [PATCH 653/779] [SPARK-22167][R][BUILD] sparkr packaging issue allow zinc ## What changes were proposed in this pull request? When zinc is running, the pwd might be the root of the project. A quick solution to this is to not go a level up in case we are already in the root rather than in root/core/. If we are in the root, everything works fine; if we are in core, the build goes up a level and runs the script from there. ## How was this patch tested? With `set -x` in the SparkR install scripts. Author: Holden Karau Closes #19402 from holdenk/SPARK-22167-sparkr-packaging-issue-allow-zinc. --- R/install-dev.sh | 1 + core/pom.xml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/install-dev.sh b/R/install-dev.sh index d613552718307..9fbc999f2e805 100755 --- a/R/install-dev.sh +++ b/R/install-dev.sh @@ -28,6 +28,7 @@ set -o pipefail set -e +set -x FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" LIB_DIR="$FWDIR/lib" diff --git a/core/pom.xml b/core/pom.xml index 09669149d8123..54f7a34a6c37e 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -499,7 +499,7 @@ - ..${file.separator}R${file.separator}install-dev${script.extension} + ${project.basedir}${file.separator}..${file.separator}R${file.separator}install-dev${script.extension} 
From e5431f2cfddc8e96194827a2123b92716c7a1467 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 2 Oct 2017 15:00:26 -0700 Subject: [PATCH 654/779] [SPARK-22158][SQL] convertMetastore should not ignore table property ## What changes were proposed in this pull request? From the beginning, convertMetastoreOrc has ignored table properties and used an empty map instead. This PR fixes that. For the diff, please see [this](https://github.com/apache/spark/pull/19382/files?w=1). convertMetastoreParquet also ignores them: ```scala val options = Map[String, String]() ``` - [SPARK-14070: HiveMetastoreCatalog.scala](https://github.com/apache/spark/pull/11891/files#diff-ee66e11b56c21364760a5ed2b783f863R650) - [Master branch: HiveStrategies.scala](https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala#L197 ) ## How was this patch tested? Pass the Jenkins with an updated test suite. Author: Dongjoon Hyun Closes #19382 from dongjoon-hyun/SPARK-22158.
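The heart of the change below is a one-line swap: instead of discarding the Hive table's storage properties, they are now passed through to the converted relation, with the Parquet schema-merging flag appended on top. The following is a minimal, standalone Scala sketch of that option map; it is only an illustration under the assumption that `ParquetOptions.MERGE_SCHEMA` resolves to the key `"mergeSchema"`, not the patched Spark code itself.

```scala
// Minimal sketch (not Spark code) of the option map built after this patch: the Hive
// table's storage properties are passed through and the schema-merging flag is appended.
object ConvertMetastoreOptionsSketch {
  // "mergeSchema" is an assumed stand-in for ParquetOptions.MERGE_SCHEMA.
  private val mergeSchemaKey = "mergeSchema"

  def parquetOptions(
      storageProperties: Map[String, String],
      schemaMerging: Boolean): Map[String, String] = {
    storageProperties + (mergeSchemaKey -> schemaMerging.toString)
  }

  def main(args: Array[String]): Unit = {
    val props = Map("compression" -> "GZIP", "serialization.format" -> "1")
    // Before the patch the equivalent of Map.empty was used, silently dropping "compression".
    // Now the result keeps compression -> GZIP and adds mergeSchema -> false.
    println(parquetOptions(props, schemaMerging = false))
  }
}
```

Because `+` on an immutable `Map` lets the right-hand entry win, the explicitly appended schema-merging entry takes precedence over any same-named key in the table properties, which mirrors what the diff does for Parquet tables.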
--- .../spark/sql/hive/HiveStrategies.scala | 4 +- .../sql/hive/execution/HiveDDLSuite.scala | 54 ++++++++++++++++--- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 805b3171cdaab..3592b8f4846d1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -189,12 +189,12 @@ case class RelationConversions( private def convert(relation: HiveTableRelation): LogicalRelation = { val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) if (serde.contains("parquet")) { - val options = Map(ParquetOptions.MERGE_SCHEMA -> + val options = relation.tableMeta.storage.properties + (ParquetOptions.MERGE_SCHEMA -> conf.getConf(HiveUtils.CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING).toString) sessionCatalog.metastoreCatalog .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet") } else { - val options = Map[String, String]() + val options = relation.tableMeta.storage.properties sessionCatalog.metastoreCatalog .convertToLogicalRelation(relation, options, classOf[OrcFileFormat], "orc") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 668da5fb47323..02e26bbe876a0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -23,6 +23,8 @@ import java.net.URI import scala.language.existentials import org.apache.hadoop.fs.Path +import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER +import org.apache.parquet.hadoop.ParquetFileReader import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkException @@ -32,6 +34,7 @@ import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAl import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils} import org.apache.spark.sql.hive.HiveExternalCatalog +import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET} import org.apache.spark.sql.hive.orc.OrcFileOperator import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -1455,12 +1458,8 @@ class HiveDDLSuite sql("INSERT INTO t SELECT 1") checkAnswer(spark.table("t"), Row(1)) // Check if this is compressed as ZLIB. 
- val maybeOrcFile = path.listFiles().find(!_.getName.endsWith(".crc")) - assert(maybeOrcFile.isDefined) - val orcFilePath = maybeOrcFile.get.toPath.toString - val expectedCompressionKind = - OrcFileOperator.getFileReader(orcFilePath).get.getCompression - assert("ZLIB" === expectedCompressionKind.name()) + val maybeOrcFile = path.listFiles().find(_.getName.startsWith("part")) + assertCompression(maybeOrcFile, "orc", "ZLIB") sql("CREATE TABLE t2 USING HIVE AS SELECT 1 AS c1, 'a' AS c2") val table2 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t2")) @@ -2009,4 +2008,47 @@ class HiveDDLSuite } } } + + private def assertCompression(maybeFile: Option[File], format: String, compression: String) = { + assert(maybeFile.isDefined) + + val actualCompression = format match { + case "orc" => + OrcFileOperator.getFileReader(maybeFile.get.toPath.toString).get.getCompression.name + + case "parquet" => + val footer = ParquetFileReader.readFooter( + sparkContext.hadoopConfiguration, new Path(maybeFile.get.getPath), NO_FILTER) + footer.getBlocks.get(0).getColumns.get(0).getCodec.toString + } + + assert(compression === actualCompression) + } + + Seq(("orc", "ZLIB"), ("parquet", "GZIP")).foreach { case (fileFormat, compression) => + test(s"SPARK-22158 convertMetastore should not ignore table property - $fileFormat") { + withSQLConf(CONVERT_METASTORE_ORC.key -> "true", CONVERT_METASTORE_PARQUET.key -> "true") { + withTable("t") { + withTempPath { path => + sql( + s""" + |CREATE TABLE t(id int) USING hive + |OPTIONS(fileFormat '$fileFormat', compression '$compression') + |LOCATION '${path.toURI}' + """.stripMargin) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(DDLUtils.isHiveTable(table)) + assert(table.storage.serde.get.contains(fileFormat)) + assert(table.storage.properties.get("compression") == Some(compression)) + assert(spark.table("t").collect().isEmpty) + + sql("INSERT INTO t SELECT 1") + checkAnswer(spark.table("t"), Row(1)) + val maybeFile = path.listFiles().find(_.getName.startsWith("part")) + assertCompression(maybeFile, fileFormat, compression) + } + } + } + } + } } From 4329eb2e73181819bb712f57ca9c7feac0d640ea Mon Sep 17 00:00:00 2001 From: Gene Pang Date: Mon, 2 Oct 2017 15:09:11 -0700 Subject: [PATCH 655/779] [SPARK-16944][Mesos] Improve data locality when launching new executors when dynamic allocation is enabled ## What changes were proposed in this pull request? Improve the Spark-Mesos coarse-grained scheduler to consider the preferred locations when dynamic allocation is enabled. ## How was this patch tested? Added a unittest, and performed manual testing on AWS. Author: Gene Pang Closes #18098 from gpang/mesos_data_locality. 
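The scheduling change below boils down to one extra predicate, `satisfiesLocality`, that the backend consults before accepting a resource offer: while dynamic allocation is on and the locality wait window has not expired, offers from hosts that are not in the preferred set (and do not already run executors) are declined. Here is a simplified, self-contained sketch of that gate; parameter names echo the diff, but the code is illustrative rather than the actual scheduler.

```scala
// Simplified, standalone sketch of the locality gate added to the Mesos
// coarse-grained scheduler below; not the real backend code.
object MesosLocalitySketch {
  def satisfiesLocality(
      offerHostname: String,
      dynamicAllocationEnabled: Boolean,
      desiredHosts: Set[String],       // hosts with pending, locality-preferring tasks
      hostsWithExecutors: Set[String], // hosts that already run executors for this app
      localityWaitStartMs: Long,
      localityWaitMs: Long,
      nowMs: Long): Boolean = {
    if (!dynamicAllocationEnabled || desiredHosts.isEmpty) return true
    // Only hosts that do not yet have executors are interesting for improving coverage.
    val remainingHosts = desiredHosts -- hostsWithExecutors
    // Within the locality wait window, decline offers from non-preferred hosts.
    val withinWaitWindow = nowMs - localityWaitStartMs <= localityWaitMs
    remainingHosts.contains(offerHostname) || !withinWaitWindow
  }

  def main(args: Array[String]): Unit = {
    // A non-preferred host is declined while we are still waiting for locality...
    println(satisfiesLocality("host1", true, Set("host10"), Set.empty, 0L, 3000L, 1000L)) // false
    // ...but accepted once the locality wait has expired.
    println(satisfiesLocality("host1", true, Set("host10"), Set.empty, 0L, 3000L, 5000L)) // true
  }
}
```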
--- .../spark/internal/config/package.scala | 4 ++ .../spark/scheduler/TaskSetManager.scala | 6 +- .../MesosCoarseGrainedSchedulerBackend.scala | 52 ++++++++++++++-- ...osCoarseGrainedSchedulerBackendSuite.scala | 62 +++++++++++++++++++ .../spark/scheduler/cluster/mesos/Utils.scala | 6 ++ 5 files changed, 123 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 44a2815b81a73..d85b6a0200b8d 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -72,6 +72,10 @@ package object config { private[spark] val DYN_ALLOCATION_MAX_EXECUTORS = ConfigBuilder("spark.dynamicAllocation.maxExecutors").intConf.createWithDefault(Int.MaxValue) + private[spark] val LOCALITY_WAIT = ConfigBuilder("spark.locality.wait") + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefaultString("3s") + private[spark] val SHUFFLE_SERVICE_ENABLED = ConfigBuilder("spark.shuffle.service.enabled").booleanConf.createWithDefault(false) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index bb867416a4fac..3bdede6743d1b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -27,7 +27,7 @@ import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.TaskState.TaskState -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{config, Logging} import org.apache.spark.scheduler.SchedulingMode._ import org.apache.spark.util.{AccumulatorV2, Clock, SystemClock, Utils} import org.apache.spark.util.collection.MedianHeap @@ -980,7 +980,7 @@ private[spark] class TaskSetManager( } private def getLocalityWait(level: TaskLocality.TaskLocality): Long = { - val defaultWait = conf.get("spark.locality.wait", "3s") + val defaultWait = conf.get(config.LOCALITY_WAIT) val localityWaitKey = level match { case TaskLocality.PROCESS_LOCAL => "spark.locality.wait.process" case TaskLocality.NODE_LOCAL => "spark.locality.wait.node" @@ -989,7 +989,7 @@ private[spark] class TaskSetManager( } if (localityWaitKey != null) { - conf.getTimeAsMs(localityWaitKey, defaultWait) + conf.getTimeAsMs(localityWaitKey, defaultWait.toString) } else { 0L } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 26699873145b4..80c0a041b7322 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -99,6 +99,14 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private var totalCoresAcquired = 0 private var totalGpusAcquired = 0 + // The amount of time to wait for locality scheduling + private val localityWait = conf.get(config.LOCALITY_WAIT) + // The start of the waiting, for data local scheduling + private var localityWaitStartTime = System.currentTimeMillis() + // If true, the scheduler is in the process of launching executors to reach the requested + // executor limit + private var 
launchingExecutors = false + // SlaveID -> Slave // This map accumulates entries for the duration of the job. Slaves are never deleted, because // we need to maintain e.g. failure state and connection state. @@ -311,6 +319,19 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( return } + if (numExecutors >= executorLimit) { + logDebug("Executor limit reached. numExecutors: " + numExecutors + + " executorLimit: " + executorLimit) + offers.asScala.map(_.getId).foreach(d.declineOffer) + launchingExecutors = false + return + } else { + if (!launchingExecutors) { + launchingExecutors = true + localityWaitStartTime = System.currentTimeMillis() + } + } + logDebug(s"Received ${offers.size} resource offers.") val (matchedOffers, unmatchedOffers) = offers.asScala.partition { offer => @@ -413,7 +434,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( val offerId = offer.getId.getValue val resources = remainingResources(offerId) - if (canLaunchTask(slaveId, resources)) { + if (canLaunchTask(slaveId, offer.getHostname, resources)) { // Create a task launchTasks = true val taskId = newMesosTaskId() @@ -477,7 +498,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpuResourcesToUse ++ memResourcesToUse ++ portResourcesToUse ++ gpuResourcesToUse) } - private def canLaunchTask(slaveId: String, resources: JList[Resource]): Boolean = { + private def canLaunchTask(slaveId: String, offerHostname: String, + resources: JList[Resource]): Boolean = { val offerMem = getResource(resources, "mem") val offerCPUs = getResource(resources, "cpus").toInt val cpus = executorCores(offerCPUs) @@ -489,9 +511,10 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus <= offerCPUs && cpus + totalCoresAcquired <= maxCores && mem <= offerMem && - numExecutors() < executorLimit && + numExecutors < executorLimit && slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES && - meetsPortRequirements + meetsPortRequirements && + satisfiesLocality(offerHostname) } private def executorCores(offerCPUs: Int): Int = { @@ -500,6 +523,25 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( ) } + private def satisfiesLocality(offerHostname: String): Boolean = { + if (!Utils.isDynamicAllocationEnabled(conf) || hostToLocalTaskCount.isEmpty) { + return true + } + + // Check the locality information + val currentHosts = slaves.values.filter(_.taskIDs.nonEmpty).map(_.hostname).toSet + val allDesiredHosts = hostToLocalTaskCount.keys.toSet + // Try to match locality for hosts which do not have executors yet, to potentially + // increase coverage. + val remainingHosts = allDesiredHosts -- currentHosts + if (!remainingHosts.contains(offerHostname) && + (System.currentTimeMillis() - localityWaitStartTime <= localityWait)) { + logDebug("Skipping host and waiting for locality. host: " + offerHostname) + return false + } + return true + } + override def statusUpdate(d: org.apache.mesos.SchedulerDriver, status: TaskStatus) { val taskId = status.getTaskId.getValue val slaveId = status.getSlaveId.getValue @@ -646,6 +688,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( // since at coarse grain it depends on the amount of slaves available. logInfo("Capping the total amount of executors to " + requestedTotal) executorLimitOption = Some(requestedTotal) + // Update the locality wait start time to continue trying for locality. 
+ localityWaitStartTime = System.currentTimeMillis() true } diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index f6bae01c3af59..6c40792112f49 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -604,6 +604,55 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite assert(backend.isReady) } + test("supports data locality with dynamic allocation") { + setBackend(Map( + "spark.dynamicAllocation.enabled" -> "true", + "spark.dynamicAllocation.testing" -> "true", + "spark.locality.wait" -> "1s")) + + assert(backend.getExecutorIds().isEmpty) + + backend.requestTotalExecutors(2, 2, Map("hosts10" -> 1, "hosts11" -> 1)) + + // Offer non-local resources, which should be rejected + offerResourcesAndVerify(1, false) + offerResourcesAndVerify(2, false) + + // Offer local resource + offerResourcesAndVerify(10, true) + + // Wait longer than spark.locality.wait + Thread.sleep(2000) + + // Offer non-local resource, which should be accepted + offerResourcesAndVerify(1, true) + + // Update total executors + backend.requestTotalExecutors(3, 3, Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1)) + + // Offer non-local resources, which should be rejected + offerResourcesAndVerify(3, false) + + // Wait longer than spark.locality.wait + Thread.sleep(2000) + + // Update total executors + backend.requestTotalExecutors(4, 4, Map("hosts10" -> 1, "hosts11" -> 1, "hosts12" -> 1, + "hosts13" -> 1)) + + // Offer non-local resources, which should be rejected + offerResourcesAndVerify(3, false) + + // Offer local resource + offerResourcesAndVerify(13, true) + + // Wait longer than spark.locality.wait + Thread.sleep(2000) + + // Offer non-local resource, which should be accepted + offerResourcesAndVerify(2, true) + } + private case class Resources(mem: Int, cpus: Int, gpus: Int = 0) private def registerMockExecutor(executorId: String, slaveId: String, cores: Integer) = { @@ -631,6 +680,19 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite backend.resourceOffers(driver, mesosOffers.asJava) } + private def offerResourcesAndVerify(id: Int, expectAccept: Boolean): Unit = { + offerResources(List(Resources(backend.executorMemory(sc), 1)), id) + if (expectAccept) { + val numExecutors = backend.getExecutorIds().size + val launchedTasks = verifyTaskLaunched(driver, s"o$id") + assert(s"s$id" == launchedTasks.head.getSlaveId.getValue) + registerMockExecutor(launchedTasks.head.getTaskId.getValue, s"s$id", 1) + assert(backend.getExecutorIds().size == numExecutors + 1) + } else { + verifyTaskNotLaunched(driver, s"o$id") + } + } + private def createTaskStatus(taskId: String, slaveId: String, state: TaskState): TaskStatus = { TaskStatus.newBuilder() .setTaskId(TaskID.newBuilder().setValue(taskId).build()) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala index 2a67cbc913ffe..833db0c1ff334 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala +++ 
b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala @@ -84,6 +84,12 @@ object Utils { captor.getValue.asScala.toList } + def verifyTaskNotLaunched(driver: SchedulerDriver, offerId: String): Unit = { + verify(driver, times(0)).launchTasks( + Matchers.eq(Collections.singleton(createOfferId(offerId))), + Matchers.any(classOf[java.util.Collection[TaskInfo]])) + } + def createOfferId(offerId: String): OfferID = { OfferID.newBuilder().setValue(offerId).build() } From fa225da7463e384529da14706e44f4a09772e5c1 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 2 Oct 2017 15:25:33 -0700 Subject: [PATCH 656/779] [SPARK-22176][SQL] Fix overflow issue in Dataset.show ## What changes were proposed in this pull request? This pr fixed an overflow issue below in `Dataset.show`: ``` scala> Seq((1, 2), (3, 4)).toDF("a", "b").show(Int.MaxValue) org.apache.spark.sql.AnalysisException: The limit expression must be equal to or greater than 0, but got -2147483648;; GlobalLimit -2147483648 +- LocalLimit -2147483648 +- Project [_1#27218 AS a#27221, _2#27219 AS b#27222] +- LocalRelation [_1#27218, _2#27219] at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:41) at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:89) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.org$apache$spark$sql$catalyst$analysis$CheckAnalysis$$checkLimitClause(CheckAnalysis.scala:70) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:234) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:80) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127) ``` ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #19401 from maropu/MaxValueInShowString. 
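The root cause is plain 32-bit integer overflow: `showString` fetches `numRows + 1` rows to decide whether there is more data to report, and with `numRows = Int.MaxValue` that addition wraps around to a negative limit, which the analyzer then rejects. A tiny standalone sketch of the overflow and of the clamp applied in the fix:

```scala
// Standalone illustration of the overflow and of the clamp used in the fix below.
object ShowOverflowSketch {
  def main(args: Array[String]): Unit = {
    val requested = Int.MaxValue
    // showString() internally takes `numRows + 1` rows; with numRows == Int.MaxValue
    // this wraps around to a negative limit:
    println(requested + 1)                              // prints -2147483648
    // The one-line fix caps numRows so that `numRows + 1` can no longer overflow:
    val numRows = requested.max(0).min(Int.MaxValue - 1)
    println(numRows + 1)                                // prints 2147483647
  }
}
```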
--- .../main/scala/org/apache/spark/sql/Dataset.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index f2a76a506eb6f..b70dfc05330f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,7 +237,7 @@ class Dataset[T] private[sql]( */ private[sql] def showString( _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { - val numRows = _numRows.max(0) + val numRows = _numRows.max(0).min(Int.MaxValue - 1) val takeResult = toDF().take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 672deeac597f1..dd8f54b690f64 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1045,6 +1045,18 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(testData.select($"*").showString(0) === expectedAnswer) } + test("showString(Int.MaxValue)") { + val df = Seq((1, 2), (3, 4)).toDF("a", "b") + val expectedAnswer = """+---+---+ + || a| b| + |+---+---+ + || 1| 2| + || 3| 4| + |+---+---+ + |""".stripMargin + assert(df.showString(Int.MaxValue) === expectedAnswer) + } + test("showString(0), vertical = true") { val expectedAnswer = "(0 rows)\n" assert(testData.select($"*").showString(0, vertical = true) === expectedAnswer) From 4c5158eec9101ef105274df6b488e292a56156a2 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 3 Oct 2017 12:38:13 -0700 Subject: [PATCH 657/779] [SPARK-21644][SQL] LocalLimit.maxRows is defined incorrectly ## What changes were proposed in this pull request? The definition of `maxRows` in `LocalLimit` operator was simply wrong. This patch introduces a new `maxRowsPerPartition` method and uses that in pruning. The patch also adds more documentation on why we need local limit vs global limit. Note that this previously has never been a bug because the way the code is structured, but future use of the maxRows could lead to bugs. ## How was this patch tested? Should be covered by existing test cases. Closes #18851 Author: gatorsmile Author: Reynold Xin Closes #19393 from gatorsmile/pr-18851. 
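To see why `maxRows` was the wrong bound for `LocalLimit`, it helps to spell out the two limit flavours: a partition-local limit caps each partition independently (so the plan as a whole may still return up to `limit * numPartitions` rows), whereas a global limit caps the total and therefore needs coordination. A toy, non-Spark sketch of the distinction that the new `maxRowsPerPartition` captures:

```scala
// Toy model of the distinction the patch documents: a partition-local limit bounds
// the row count per partition, a global limit bounds the total. Not Spark code.
object LimitSemanticsSketch {
  def localLimit(partitions: Seq[Seq[Int]], n: Int): Seq[Seq[Int]] =
    partitions.map(_.take(n))        // at most n rows *per partition*

  def globalLimit(partitions: Seq[Seq[Int]], n: Int): Seq[Int] =
    partitions.flatten.take(n)       // at most n rows in total (needs coordination)

  def main(args: Array[String]): Unit = {
    val parts = Seq(Seq(1, 2, 3), Seq(4, 5), Seq(6, 7, 8, 9))
    println(localLimit(parts, 2))    // List(List(1, 2), List(4, 5), List(6, 7))
    println(globalLimit(parts, 2))   // List(1, 2)
  }
}
```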
--- .../sql/catalyst/optimizer/Optimizer.scala | 29 ++++++----- .../catalyst/plans/logical/LogicalPlan.scala | 5 ++ .../plans/logical/basicLogicalOperators.scala | 49 ++++++++++++++++++- .../execution/basicPhysicalOperators.scala | 3 ++ 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index b9fa39d6dad4c..bc2d4a824cb49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -305,13 +305,20 @@ object LimitPushDown extends Rule[LogicalPlan] { } } - private def maybePushLimit(limitExp: Expression, plan: LogicalPlan): LogicalPlan = { - (limitExp, plan.maxRows) match { - case (IntegerLiteral(maxRow), Some(childMaxRows)) if maxRow < childMaxRows => + private def maybePushLocalLimit(limitExp: Expression, plan: LogicalPlan): LogicalPlan = { + (limitExp, plan.maxRowsPerPartition) match { + case (IntegerLiteral(newLimit), Some(childMaxRows)) if newLimit < childMaxRows => + // If the child has a cap on max rows per partition and the cap is larger than + // the new limit, put a new LocalLimit there. LocalLimit(limitExp, stripGlobalLimitIfPresent(plan)) + case (_, None) => + // If the child has no cap, put the new LocalLimit. LocalLimit(limitExp, stripGlobalLimitIfPresent(plan)) - case _ => plan + + case _ => + // Otherwise, don't put a new LocalLimit. + plan } } @@ -323,7 +330,7 @@ object LimitPushDown extends Rule[LogicalPlan] { // pushdown Limit through it. Once we add UNION DISTINCT, however, we will not be able to // pushdown Limit. case LocalLimit(exp, Union(children)) => - LocalLimit(exp, Union(children.map(maybePushLimit(exp, _)))) + LocalLimit(exp, Union(children.map(maybePushLocalLimit(exp, _)))) // Add extra limits below OUTER JOIN. For LEFT OUTER and FULL OUTER JOIN we push limits to the // left and right sides, respectively. For FULL OUTER JOIN, we can only push limits to one side // because we need to ensure that rows from the limited side still have an opportunity to match @@ -335,19 +342,19 @@ object LimitPushDown extends Rule[LogicalPlan] { // - If neither side is limited, limit the side that is estimated to be bigger. 
case LocalLimit(exp, join @ Join(left, right, joinType, _)) => val newJoin = joinType match { - case RightOuter => join.copy(right = maybePushLimit(exp, right)) - case LeftOuter => join.copy(left = maybePushLimit(exp, left)) + case RightOuter => join.copy(right = maybePushLocalLimit(exp, right)) + case LeftOuter => join.copy(left = maybePushLocalLimit(exp, left)) case FullOuter => (left.maxRows, right.maxRows) match { case (None, None) => if (left.stats.sizeInBytes >= right.stats.sizeInBytes) { - join.copy(left = maybePushLimit(exp, left)) + join.copy(left = maybePushLocalLimit(exp, left)) } else { - join.copy(right = maybePushLimit(exp, right)) + join.copy(right = maybePushLocalLimit(exp, right)) } case (Some(_), Some(_)) => join - case (Some(_), None) => join.copy(left = maybePushLimit(exp, left)) - case (None, Some(_)) => join.copy(right = maybePushLimit(exp, right)) + case (Some(_), None) => join.copy(left = maybePushLocalLimit(exp, left)) + case (None, Some(_)) => join.copy(right = maybePushLocalLimit(exp, right)) } case _ => join diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 68aae720e026a..14188829db2af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -97,6 +97,11 @@ abstract class LogicalPlan */ def maxRows: Option[Long] = None + /** + * Returns the maximum number of rows this plan may compute on each partition. + */ + def maxRowsPerPartition: Option[Long] = maxRows + /** * Returns true if this expression and all its children have been resolved to a specific schema * and false if it still contains any unresolved placeholders. Implementations of LogicalPlan diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index f443cd5a69de3..80243d3d356ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -191,6 +191,9 @@ object Union { } } +/** + * Logical plan for unioning two plans, without a distinct. This is UNION ALL in SQL. + */ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { override def maxRows: Option[Long] = { if (children.exists(_.maxRows.isEmpty)) { @@ -200,6 +203,17 @@ case class Union(children: Seq[LogicalPlan]) extends LogicalPlan { } } + /** + * Note the definition has assumption about how union is implemented physically. + */ + override def maxRowsPerPartition: Option[Long] = { + if (children.exists(_.maxRowsPerPartition.isEmpty)) { + None + } else { + Some(children.flatMap(_.maxRowsPerPartition).sum) + } + } + // updating nullability to make all the children consistent override def output: Seq[Attribute] = children.map(_.output).transpose.map(attrs => @@ -669,6 +683,27 @@ case class Pivot( } } +/** + * A constructor for creating a logical limit, which is split into two separate logical nodes: + * a [[LocalLimit]], which is a partition local limit, followed by a [[GlobalLimit]]. + * + * This muds the water for clean logical/physical separation, and is done for better limit pushdown. 
+ * In distributed query processing, a non-terminal global limit is actually an expensive operation + * because it requires coordination (in Spark this is done using a shuffle). + * + * In most cases when we want to push down limit, it is often better to only push some partition + * local limit. Consider the following: + * + * GlobalLimit(Union(A, B)) + * + * It is better to do + * GlobalLimit(Union(LocalLimit(A), LocalLimit(B))) + * + * than + * Union(GlobalLimit(A), GlobalLimit(B)). + * + * So we introduced LocalLimit and GlobalLimit in the logical plan node for limit pushdown. + */ object Limit { def apply(limitExpr: Expression, child: LogicalPlan): UnaryNode = { GlobalLimit(limitExpr, LocalLimit(limitExpr, child)) @@ -682,6 +717,11 @@ object Limit { } } +/** + * A global (coordinated) limit. This operator can emit at most `limitExpr` number in total. + * + * See [[Limit]] for more information. + */ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output override def maxRows: Option[Long] = { @@ -692,9 +732,16 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryN } } +/** + * A partition-local (non-coordinated) limit. This operator can emit at most `limitExpr` number + * of tuples on each physical partition. + * + * See [[Limit]] for more information. + */ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output - override def maxRows: Option[Long] = { + + override def maxRowsPerPartition: Option[Long] = { limitExpr match { case IntegerLiteral(limit) => Some(limit) case _ => None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 8389e2f3d5be9..63cd1691f4cd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -554,6 +554,9 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) /** * Physical plan for unioning two plans, without a distinct. This is UNION ALL in SQL. + * + * If we change how this is implemented physically, we'd need to update + * [[org.apache.spark.sql.catalyst.plans.logical.Union.maxRowsPerPartition]]. */ case class UnionExec(children: Seq[SparkPlan]) extends SparkPlan { override def output: Seq[Attribute] = From e65b6b7ca1a7cff1b91ad2262bb7941e6bf057cd Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 3 Oct 2017 12:40:22 -0700 Subject: [PATCH 658/779] [SPARK-22178][SQL] Refresh Persistent Views by REFRESH TABLE Command ## What changes were proposed in this pull request? The underlying tables of persistent views are not refreshed when users issue the REFRESH TABLE command against the persistent views. ## How was this patch tested? Added a test case Author: gatorsmile Closes #19405 from gatorsmile/refreshView. 
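In user-facing terms, the behaviour being fixed is the one exercised by the new test below: refreshing a persistent view should also refresh the tables the view reads from. A rough end-to-end illustration, assuming a local `SparkSession` built with Hive support on the classpath (the table and view names are made up for the example):

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only: mirrors the scenario covered by the new test in this patch.
object RefreshViewExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .enableHiveSupport()   // assumes Hive support is available in the build
      .getOrCreate()

    spark.range(0, 100).write.saveAsTable("view_table")
    spark.sql("CREATE VIEW view_refresh AS SELECT * FROM view_table WHERE id > -1")
    spark.sql("SELECT count(*) FROM view_refresh").show()   // 100

    // If the files backing view_table change underneath Spark, refreshing the *view*
    // now also refreshes the metadata/data cached for the underlying table:
    spark.sql("REFRESH TABLE view_refresh")
  }
}
```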
--- .../apache/spark/sql/internal/CatalogImpl.scala | 15 +++++++++++---- .../spark/sql/hive/HiveMetadataCacheSuite.scala | 14 +++++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 142b005850a49..fdd25330c5e67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -474,13 +474,20 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { */ override def refreshTable(tableName: String): Unit = { val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName) - // Temp tables: refresh (or invalidate) any metadata/data cached in the plan recursively. - // Non-temp tables: refresh the metadata cache. - sessionCatalog.refreshTable(tableIdent) + val tableMetadata = sessionCatalog.getTempViewOrPermanentTableMetadata(tableIdent) + val table = sparkSession.table(tableIdent) + + if (tableMetadata.tableType == CatalogTableType.VIEW) { + // Temp or persistent views: refresh (or invalidate) any metadata/data cached + // in the plan recursively. + table.queryExecution.analyzed.foreach(_.refresh()) + } else { + // Non-temp tables: refresh the metadata cache. + sessionCatalog.refreshTable(tableIdent) + } // If this table is cached as an InMemoryRelation, drop the original // cached version and make the new version cached lazily. - val table = sparkSession.table(tableIdent) if (isCached(table)) { // Uncache the logicalPlan. sparkSession.sharedState.cacheManager.uncacheQuery(table, blocking = true) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 0c28a1b609bb8..e71aba72c31fe 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -31,14 +31,22 @@ import org.apache.spark.sql.test.SQLTestUtils class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-16337 temporary view refresh") { - withTempView("view_refresh") { + checkRefreshView(isTemp = true) + } + + test("view refresh") { + checkRefreshView(isTemp = false) + } + + private def checkRefreshView(isTemp: Boolean) { + withView("view_refresh") { withTable("view_table") { // Create a Parquet directory spark.range(start = 0, end = 100, step = 1, numPartitions = 3) .write.saveAsTable("view_table") - // Read the table in - spark.table("view_table").filter("id > -1").createOrReplaceTempView("view_refresh") + val temp = if (isTemp) "TEMPORARY" else "" + spark.sql(s"CREATE $temp VIEW view_refresh AS SELECT * FROM view_table WHERE id > -1") assert(sql("select count(*) from view_refresh").first().getLong(0) == 100) // Delete a file using the Hadoop file system interface since the path returned by From e36ec38d89472df0dfe12222b6af54cd6eea8e98 Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Tue, 3 Oct 2017 16:53:32 -0700 Subject: [PATCH 659/779] [SPARK-20466][CORE] HadoopRDD#addLocalConfiguration throws NPE ## What changes were proposed in this pull request? Fix for SPARK-20466, full description of the issue in the JIRA. To summarize, `HadoopRDD` uses a metadata cache to cache `JobConf` objects. 
The cache uses soft references, which means the JVM can delete entries from the cache whenever there is GC pressure. `HadoopRDD#getJobConf` had a bug where it would check whether the cache contained the `JobConf`; if it did, it would get the `JobConf` from the cache and return it. This doesn't work when soft references are used, as the JVM can delete the entry between the existence check and the get call. ## How was this patch tested? Haven't thought of a good way to test this yet, given that the issue only occurs sometimes and happens during high GC pressure. I was thinking of using mocks to verify `#getJobConf` is doing the right thing. I deleted the method `HadoopRDD#containsCachedMetadata` so that we don't hit this issue again. Author: Sahil Takiar Closes #19413 from sahilTakiar/master. --- .../org/apache/spark/rdd/HadoopRDD.scala | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 76ea8b86c53d2..23b344230e490 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -157,20 +157,25 @@ class HadoopRDD[K, V]( if (conf.isInstanceOf[JobConf]) { logDebug("Re-using user-broadcasted JobConf") conf.asInstanceOf[JobConf] - } else if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) { - logDebug("Re-using cached JobConf") - HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf] } else { - // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the - // local process. The local cache is accessed through HadoopRDD.putCachedMetadata(). - // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects. - // Synchronize to prevent ConcurrentModificationException (SPARK-1097, HADOOP-10456). + Option(HadoopRDD.getCachedMetadata(jobConfCacheKey)) + .map { conf => + logDebug("Re-using cached JobConf") + conf.asInstanceOf[JobConf] + } + .getOrElse { + // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in + // the local process. The local cache is accessed through HadoopRDD.putCachedMetadata(). + // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary + // objects. Synchronize to prevent ConcurrentModificationException (SPARK-1097, + // HADOOP-10456). 
+ HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized { + logDebug("Creating new JobConf and caching it for later re-use") + val newJobConf = new JobConf(conf) + initLocalJobConfFuncOpt.foreach(f => f(newJobConf)) + HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf) + newJobConf + } } } } @@ -360,8 +365,6 @@ private[spark] object HadoopRDD extends Logging { */ def getCachedMetadata(key: String): Any = SparkEnv.get.hadoopJobMetadata.get(key) - def containsCachedMetadata(key: String): Boolean = SparkEnv.get.hadoopJobMetadata.containsKey(key) - private def putCachedMetadata(key: String, value: Any): Unit = SparkEnv.get.hadoopJobMetadata.put(key, value) 
From 5f694334534e4425fb9e8abf5b7e3e5efdfcef50 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 3 Oct 2017 21:27:58 -0700 Subject: [PATCH 660/779] [SPARK-22171][SQL] Describe Table Extended Failed when Table Owner is Empty ## What changes were proposed in this pull request? Users could hit a `java.lang.NullPointerException` when a table was created by Hive and the table owner read from the Hive metastore is `null`. `DESC EXTENDED` failed with the error: > SQLExecutionException: java.lang.NullPointerException at scala.collection.immutable.StringOps$.length$extension(StringOps.scala:47) at scala.collection.immutable.StringOps.length(StringOps.scala:47) at scala.collection.IndexedSeqOptimized$class.isEmpty(IndexedSeqOptimized.scala:27) at scala.collection.immutable.StringOps.isEmpty(StringOps.scala:29) at scala.collection.TraversableOnce$class.nonEmpty(TraversableOnce.scala:111) at scala.collection.immutable.StringOps.nonEmpty(StringOps.scala:29) at org.apache.spark.sql.catalyst.catalog.CatalogTable.toLinkedHashMap(interface.scala:300) at org.apache.spark.sql.execution.command.DescribeTableCommand.describeFormattedTableInfo(tables.scala:565) at org.apache.spark.sql.execution.command.DescribeTableCommand.run(tables.scala:543) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:66) at ## How was this patch tested? Added a unit test case. Author: gatorsmile Closes #19395 from gatorsmile/desc. 
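The fix is two small null-safety guards: `CatalogTable.toLinkedHashMap` no longer calls `nonEmpty` on a possibly-null owner string, and `HiveClientImpl` normalizes a `null` owner from the metastore to an empty string. A standalone sketch of both guards (illustrative helpers, not the patched classes):

```scala
// Standalone sketch of the two null-safety guards in the patch below: never call
// .nonEmpty on a possibly-null owner string, and normalize nulls coming from Hive.
object NullOwnerSketch {
  // Mirrors the guard added in CatalogTable.toLinkedHashMap:
  def ownerEntry(owner: String): Option[(String, String)] =
    if (owner != null && owner.nonEmpty) Some("Owner" -> owner) else None

  // Mirrors the normalization added in HiveClientImpl when reading Hive metadata:
  def normalizeOwner(ownerFromHive: String): String =
    Option(ownerFromHive).getOrElse("")

  def main(args: Array[String]): Unit = {
    println(ownerEntry(null))        // None, instead of a NullPointerException
    println(normalizeOwner(null))    // prints an empty string
    println(ownerEntry("alice"))     // Some((Owner,alice))
  }
}
```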
--- .../sql/catalyst/catalog/interface.scala | 2 +- .../sql/catalyst/analysis/CatalogSuite.scala | 37 +++++++++++++++++++ .../sql/hive/client/HiveClientImpl.scala | 2 +- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CatalogSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 1965144e81197..fe2af910a0ae5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -307,7 +307,7 @@ case class CatalogTable( identifier.database.foreach(map.put("Database", _)) map.put("Table", identifier.table) - if (owner.nonEmpty) map.put("Owner", owner) + if (owner != null && owner.nonEmpty) map.put("Owner", owner) map.put("Created Time", new Date(createTime).toString) map.put("Last Access", new Date(lastAccessTime).toString) map.put("Created By", "Spark " + createVersion) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CatalogSuite.scala new file mode 100644 index 0000000000000..d670053ba1b5d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CatalogSuite.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.types.StructType + + +class CatalogSuite extends AnalysisTest { + + test("desc table when owner is set to null") { + val table = CatalogTable( + identifier = TableIdentifier("tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty, + owner = null, + schema = new StructType().add("col1", "int").add("col2", "string"), + provider = Some("parquet")) + table.toLinkedHashMap + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index c4e48c9360db7..66165c7228bca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -461,7 +461,7 @@ private[hive] class HiveClientImpl( // in table properties. This means, if we have bucket spec in both hive metastore and // table properties, we will trust the one in table properties. 
bucketSpec = bucketSpec, - owner = h.getOwner, + owner = Option(h.getOwner).getOrElse(""), createTime = h.getTTable.getCreateTime.toLong * 1000, lastAccessTime = h.getLastAccessTime.toLong * 1000, storage = CatalogStorageFormat( From 3099c574c56cab86c3fcf759864f89151643f837 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Tue, 3 Oct 2017 21:42:51 -0700 Subject: [PATCH 661/779] [SPARK-22136][SS] Implement stream-stream outer joins. ## What changes were proposed in this pull request? Allow one-sided outer joins between two streams when a watermark is defined. ## How was this patch tested? new unit tests Author: Jose Torres Closes #19327 from joseph-torres/outerjoin. --- .../analysis/StreamingJoinHelper.scala | 286 +++++++++++++++++ .../UnsupportedOperationChecker.scala | 53 +++- .../analysis/StreamingJoinHelperSuite.scala | 140 ++++++++ .../analysis/UnsupportedOperationsSuite.scala | 108 ++++++- .../StreamingSymmetricHashJoinExec.scala | 152 +++++++-- .../StreamingSymmetricHashJoinHelper.scala | 241 +------------- .../state/SymmetricHashJoinStateManager.scala | 200 +++++++++--- .../SymmetricHashJoinStateManagerSuite.scala | 6 +- .../sql/streaming/StreamingJoinSuite.scala | 298 +++++++++++------- 9 files changed, 1051 insertions(+), 433 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala new file mode 100644 index 0000000000000..072dc954879ca --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import scala.util.control.NonFatal + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, AttributeSet, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} +import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval + + +/** + * Helper object for stream joins. 
See [[StreamingSymmetricHashJoinExec]] in SQL for more details. + */ +object StreamingJoinHelper extends PredicateHelper with Logging { + + /** + * Check the provided logical plan to see if its join keys contain a watermark attribute. + * + * Will return false if the plan is not an equijoin. + * @param plan the logical plan to check + */ + def isWatermarkInJoinKeys(plan: LogicalPlan): Boolean = { + plan match { + case ExtractEquiJoinKeys(_, leftKeys, rightKeys, _, _, _) => + (leftKeys ++ rightKeys).exists { + case a: AttributeReference => a.metadata.contains(EventTimeWatermark.delayKey) + case _ => false + } + case _ => false + } + } + + /** + * Get state value watermark (see [[StreamingSymmetricHashJoinExec]] for context about it) + * given the join condition and the event time watermark. This is how it works. + * - The condition is split into conjunctive predicates, and we find the predicates of the + * form `leftTime + c1 < rightTime + c2` (or <=, >, >=). + * - We canoncalize the predicate and solve it with the event time watermark value to find the + * value of the state watermark. + * This function is supposed to make best-effort attempt to get the state watermark. If there is + * any error, it will return None. + * + * @param attributesToFindStateWatermarkFor attributes of the side whose state watermark + * is to be calculated + * @param attributesWithEventWatermark attributes of the other side which has a watermark column + * @param joinCondition join condition + * @param eventWatermark watermark defined on the input event data + * @return state value watermark in milliseconds, is possible. + */ + def getStateValueWatermark( + attributesToFindStateWatermarkFor: AttributeSet, + attributesWithEventWatermark: AttributeSet, + joinCondition: Option[Expression], + eventWatermark: Option[Long]): Option[Long] = { + + // If condition or event time watermark is not provided, then cannot calculate state watermark + if (joinCondition.isEmpty || eventWatermark.isEmpty) return None + + // If there is not watermark attribute, then cannot define state watermark + if (!attributesWithEventWatermark.exists(_.metadata.contains(delayKey))) return None + + def getStateWatermarkSafely(l: Expression, r: Expression): Option[Long] = { + try { + getStateWatermarkFromLessThenPredicate( + l, r, attributesToFindStateWatermarkFor, attributesWithEventWatermark, eventWatermark) + } catch { + case NonFatal(e) => + logWarning(s"Error trying to extract state constraint from condition $joinCondition", e) + None + } + } + + val allStateWatermarks = splitConjunctivePredicates(joinCondition.get).flatMap { predicate => + + // The generated the state watermark cleanup expression is inclusive of the state watermark. + // If state watermark is W, all state where timestamp <= W will be cleaned up. + // Now when the canonicalized join condition solves to leftTime >= W, we dont want to clean + // up leftTime <= W. Rather we should clean up leftTime <= W - 1. Hence the -1 below. 
+ val stateWatermark = predicate match { + case LessThan(l, r) => getStateWatermarkSafely(l, r) + case LessThanOrEqual(l, r) => getStateWatermarkSafely(l, r).map(_ - 1) + case GreaterThan(l, r) => getStateWatermarkSafely(r, l) + case GreaterThanOrEqual(l, r) => getStateWatermarkSafely(r, l).map(_ - 1) + case _ => None + } + if (stateWatermark.nonEmpty) { + logInfo(s"Condition $joinCondition generated watermark constraint = ${stateWatermark.get}") + } + stateWatermark + } + allStateWatermarks.reduceOption((x, y) => Math.min(x, y)) + } + + /** + * Extract the state value watermark (milliseconds) from the condition + * `LessThan(leftExpr, rightExpr)` where . For example: if we want to find the constraint for + * leftTime using the watermark on the rightTime. Example: + * + * Input: rightTime-with-watermark + c1 < leftTime + c2 + * Canonical form: rightTime-with-watermark + c1 + (-c2) + (-leftTime) < 0 + * Solving for rightTime: rightTime-with-watermark + c1 + (-c2) < leftTime + * With watermark value: watermark-value + c1 + (-c2) < leftTime + */ + private def getStateWatermarkFromLessThenPredicate( + leftExpr: Expression, + rightExpr: Expression, + attributesToFindStateWatermarkFor: AttributeSet, + attributesWithEventWatermark: AttributeSet, + eventWatermark: Option[Long]): Option[Long] = { + + val attributesInCondition = AttributeSet( + leftExpr.collect { case a: AttributeReference => a } ++ + rightExpr.collect { case a: AttributeReference => a } + ) + if (attributesInCondition.filter { attributesToFindStateWatermarkFor.contains(_) }.size > 1 || + attributesInCondition.filter { attributesWithEventWatermark.contains(_) }.size > 1) { + // If more than attributes present in condition from one side, then it cannot be solved + return None + } + + def containsAttributeToFindStateConstraintFor(e: Expression): Boolean = { + e.collectLeaves().collectFirst { + case a @ AttributeReference(_, _, _, _) + if attributesToFindStateWatermarkFor.contains(a) => a + }.nonEmpty + } + + // Canonicalization step 1: convert to (rightTime-with-watermark + c1) - (leftTime + c2) < 0 + val allOnLeftExpr = Subtract(leftExpr, rightExpr) + logDebug(s"All on Left:\n${allOnLeftExpr.treeString(true)}\n${allOnLeftExpr.asCode}") + + // Canonicalization step 2: extract commutative terms + // rightTime-with-watermark, c1, -leftTime, -c2 + val terms = ExpressionSet(collectTerms(allOnLeftExpr)) + logDebug("Terms extracted from join condition:\n\t" + terms.mkString("\n\t")) + + // Find the term that has leftTime (i.e. the one present in attributesToFindConstraintFor + val constraintTerms = terms.filter(containsAttributeToFindStateConstraintFor) + + // Verify there is only one correct constraint term and of the correct type + if (constraintTerms.size > 1) { + logWarning("Failed to extract state constraint terms: multiple time terms in condition\n\t" + + terms.mkString("\n\t")) + return None + } + if (constraintTerms.isEmpty) { + logDebug("Failed to extract state constraint terms: no time terms in condition\n\t" + + terms.mkString("\n\t")) + return None + } + val constraintTerm = constraintTerms.head + if (constraintTerm.collectFirst { case u: UnaryMinus => u }.isEmpty) { + // Incorrect condition. We want the constraint term in canonical form to be `-leftTime` + // so that resolve for it as `-leftTime + watermark + c < 0` ==> `watermark + c < leftTime`. 
+      // Now, if the original condition is `rightTime-with-watermark > leftTime` and the watermark
+      // condition is `rightTime-with-watermark > watermarkValue`, then no constraint about
+      // `leftTime` can be inferred. In this case, after canonicalization and collection of terms,
+      // the constraintTerm would be `leftTime` and not `-leftTime`. Hence, we return None.
+      return None
+    }
+
+    // Replace watermark attribute with watermark value, and generate the resolved expression
+    // from the other terms. That is,
+    // rightTime-with-watermark, c1, -c2  =>  watermark, c1, -c2  =>  watermark + c1 + (-c2)
+    logDebug(s"Constraint term from join condition:\t$constraintTerm")
+    val exprWithWatermarkSubstituted = (terms - constraintTerm).map { term =>
+      term.transform {
+        case a @ AttributeReference(_, _, _, metadata)
+          if attributesWithEventWatermark.contains(a) && metadata.contains(delayKey) =>
+          Multiply(Literal(eventWatermark.get.toDouble), Literal(1000.0))
+      }
+    }.reduceLeft(Add)
+
+    // Calculate the constraint value
+    logInfo(s"Final expression to evaluate constraint:\t$exprWithWatermarkSubstituted")
+    val constraintValue = exprWithWatermarkSubstituted.eval().asInstanceOf[java.lang.Double]
+    Some((Double2double(constraintValue) / 1000.0).toLong)
+  }
+
+  /**
+   * Collect all the terms present in an expression after converting it into the form
+   * a + b + c + d where each term is either an attribute or a literal cast to long,
+   * optionally wrapped in a unary minus.
+   */
+  private def collectTerms(exprToCollectFrom: Expression): Seq[Expression] = {
+    var invalid = false
+
+    /** Wrap a term with UnaryMinus if it needs to be negated. */
+    def negateIfNeeded(expr: Expression, minus: Boolean): Expression = {
+      if (minus) UnaryMinus(expr) else expr
+    }
+
+    /**
+     * Recursively split the expression into its leaf terms containing attributes or literals.
+     * Returns terms only of the forms:
+     *    Cast(AttributeReference), UnaryMinus(Cast(AttributeReference)),
+     *    Cast(AttributeReference, Double), UnaryMinus(Cast(AttributeReference, Double))
+     *    Multiply(Literal), UnaryMinus(Multiply(Literal))
+     *    Multiply(Cast(Literal)), UnaryMinus(Multiply(Cast(Literal)))
+     *
+     * Note:
+     * - If a term needs to be negated to make it a commutative term,
+     *   then it will be wrapped in UnaryMinus(...)
+     * - Each term represents a timestamp value or a time interval in microseconds,
+     *   typed as doubles.
+     */
+    def collect(expr: Expression, negate: Boolean): Seq[Expression] = {
+      expr match {
+        case Add(left, right) =>
+          collect(left, negate) ++ collect(right, negate)
+        case Subtract(left, right) =>
+          collect(left, negate) ++ collect(right, !negate)
+        case TimeAdd(left, right, _) =>
+          collect(left, negate) ++ collect(right, negate)
+        case TimeSub(left, right, _) =>
+          collect(left, negate) ++ collect(right, !negate)
+        case UnaryMinus(child) =>
+          collect(child, !negate)
+        case CheckOverflow(child, _) =>
+          collect(child, negate)
+        case Cast(child, dataType, _) =>
+          dataType match {
+            case _: NumericType | _: TimestampType => collect(child, negate)
+            case _ =>
+              invalid = true
+              Seq.empty
+          }
+        case a: AttributeReference =>
+          val castedRef = if (a.dataType != DoubleType) Cast(a, DoubleType) else a
+          Seq(negateIfNeeded(castedRef, negate))
+        case lit: Literal =>
+          // If the literal is a calendar interval, then explicitly convert it to microseconds.
+          // Convert other numeric literals to doubles representing microseconds (by x1000000).
+          val castedLit = lit.dataType match {
+            case CalendarIntervalType =>
+              val calendarInterval = lit.value.asInstanceOf[CalendarInterval]
+              if (calendarInterval.months > 0) {
+                invalid = true
+                logWarning(
+                  s"Failed to extract state value watermark from condition $exprToCollectFrom " +
+                    s"as imprecise intervals like months and years cannot be used for " +
+                    s"watermark calculation. Use intervals in terms of days instead.")
+                Literal(0.0)
+              } else {
+                Literal(calendarInterval.microseconds.toDouble)
+              }
+            case DoubleType =>
+              Multiply(lit, Literal(1000000.0))
+            case _: NumericType =>
+              Multiply(Cast(lit, DoubleType), Literal(1000000.0))
+            case _: TimestampType =>
+              Multiply(PreciseTimestampConversion(lit, TimestampType, LongType), Literal(1000000.0))
+          }
+          Seq(negateIfNeeded(castedLit, negate))
+        case a @ _ =>
+          logWarning(
+            s"Failed to extract state value watermark from condition $exprToCollectFrom due to $a")
+          invalid = true
+          Seq.empty
+      }
+    }
+
+    val terms = collect(exprToCollectFrom, negate = false)
+    if (!invalid) terms else Seq.empty
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
index d1d705691b076..dee6fbe9d1514 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala
@@ -18,8 +18,9 @@ package org.apache.spark.sql.catalyst.analysis

 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
@@ -217,7 +218,7 @@ object UnsupportedOperationChecker {
           throwError("dropDuplicates is not supported after aggregation on a " +
             "streaming DataFrame/Dataset")

-        case Join(left, right, joinType, _) =>
+        case Join(left, right, joinType, condition) =>

           joinType match {

@@ -233,16 +234,52 @@ object UnsupportedOperationChecker {
                 throwError("Full outer joins with streaming DataFrames/Datasets are not supported")
               }

-            case LeftOuter | LeftSemi | LeftAnti =>
+            case LeftSemi | LeftAnti =>
               if (right.isStreaming) {
-                throwError("Left outer/semi/anti joins with a streaming DataFrame/Dataset " +
-                  "on the right is not supported")
+                throwError("Left semi/anti joins with a streaming DataFrame/Dataset " +
+                  "on the right are not supported")
               }
+
+            // We support streaming left outer joins with static on the right always, and with
+            // stream on both sides under the appropriate conditions.
+            case LeftOuter =>
+              if (!left.isStreaming && right.isStreaming) {
+                throwError("Left outer join with a streaming DataFrame/Dataset " +
+                  "on the right and a static DataFrame/Dataset on the left is not supported")
+              } else if (left.isStreaming && right.isStreaming) {
+                val watermarkInJoinKeys = StreamingJoinHelper.isWatermarkInJoinKeys(subPlan)
+
+                val hasValidWatermarkRange =
+                  StreamingJoinHelper.getStateValueWatermark(
+                    left.outputSet, right.outputSet, condition, Some(1000000)).isDefined
+
+                if (!watermarkInJoinKeys && !hasValidWatermarkRange) {
+                  throwError("Stream-stream outer join between two streaming DataFrame/Datasets " +
+                    "is not supported without a watermark in the join keys, or a watermark on " +
+                    "the nullable side and an appropriate range condition")
+                }
+              }
+
+            // We support streaming right outer joins with static on the left always, and with
+            // stream on both sides under the appropriate conditions.
             case RightOuter =>
-              if (left.isStreaming) {
-                throwError("Right outer join with a streaming DataFrame/Dataset on the left is " +
-                  "not supported")
+              if (left.isStreaming && !right.isStreaming) {
+                throwError("Right outer join with a streaming DataFrame/Dataset on the left and " +
+                  "a static DataFrame/Dataset on the right is not supported")
+              } else if (left.isStreaming && right.isStreaming) {
+                val isWatermarkInJoinKeys = StreamingJoinHelper.isWatermarkInJoinKeys(subPlan)
+
+                // Check if the nullable side has a watermark, and there's a range condition which
+                // implies a state value watermark on the first side.
+                val hasValidWatermarkRange =
+                  StreamingJoinHelper.getStateValueWatermark(
+                    right.outputSet, left.outputSet, condition, Some(1000000)).isDefined
+
+                if (!isWatermarkInJoinKeys && !hasValidWatermarkRange) {
+                  throwError("Stream-stream outer join between two streaming DataFrame/Datasets " +
+                    "is not supported without a watermark in the join keys, or a watermark on " +
+                    "the nullable side and an appropriate range condition")
+                }
               }

             case NaturalJoin(_) | UsingJoin(_, _) =>
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala
new file mode 100644
index 0000000000000..8cf41a02320d2
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} +import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter, LeafNode, LocalRelation} +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, TimestampType} + +class StreamingJoinHelperSuite extends AnalysisTest { + + test("extract watermark from time condition") { + val attributesToFindConstraintFor = Seq( + AttributeReference("leftTime", TimestampType)(), + AttributeReference("leftOther", IntegerType)()) + val metadataWithWatermark = new MetadataBuilder() + .putLong(EventTimeWatermark.delayKey, 1000) + .build() + val attributesWithWatermark = Seq( + AttributeReference("rightTime", TimestampType, metadata = metadataWithWatermark)(), + AttributeReference("rightOther", IntegerType)()) + + case class DummyLeafNode() extends LeafNode { + override def output: Seq[Attribute] = + attributesToFindConstraintFor ++ attributesWithWatermark + } + + def watermarkFrom( + conditionStr: String, + rightWatermark: Option[Long] = Some(10000)): Option[Long] = { + val conditionExpr = Some(conditionStr).map { str => + val plan = + Filter( + CatalystSqlParser.parseExpression(str), + DummyLeafNode()) + val optimized = SimpleTestOptimizer.execute(SimpleAnalyzer.execute(plan)) + optimized.asInstanceOf[Filter].condition + } + StreamingJoinHelper.getStateValueWatermark( + AttributeSet(attributesToFindConstraintFor), AttributeSet(attributesWithWatermark), + conditionExpr, rightWatermark) + } + + // Test comparison directionality. E.g. if leftTime < rightTime and rightTime > watermark, + // then cannot define constraint on leftTime. 
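+    // Roughly, with rightWatermark = Some(10000): "leftTime > rightTime" canonicalizes to
+    // rightTime + (-leftTime) < 0, and substituting the watermark value for the watermarked
+    // rightTime gives leftTime > 10000 ms. Left-side state with leftTime <= 10 s can then be
+    // cleaned up, so the state value watermark is Some(10000).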
+ assert(watermarkFrom("leftTime > rightTime") === Some(10000)) + assert(watermarkFrom("leftTime >= rightTime") === Some(9999)) + assert(watermarkFrom("leftTime < rightTime") === None) + assert(watermarkFrom("leftTime <= rightTime") === None) + assert(watermarkFrom("rightTime > leftTime") === None) + assert(watermarkFrom("rightTime >= leftTime") === None) + assert(watermarkFrom("rightTime < leftTime") === Some(10000)) + assert(watermarkFrom("rightTime <= leftTime") === Some(9999)) + + // Test type conversions + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG)") === Some(10000)) + assert(watermarkFrom("CAST(leftTime AS LONG) < CAST(rightTime AS LONG)") === None) + assert(watermarkFrom("CAST(leftTime AS DOUBLE) > CAST(rightTime AS DOUBLE)") === Some(10000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS DOUBLE)") === Some(10000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS FLOAT)") === Some(10000)) + assert(watermarkFrom("CAST(leftTime AS DOUBLE) > CAST(rightTime AS FLOAT)") === Some(10000)) + assert(watermarkFrom("CAST(leftTime AS STRING) > CAST(rightTime AS STRING)") === None) + + // Test with timestamp type + calendar interval on either side of equation + // Note: timestamptype and calendar interval don't commute, so less valid combinations to test. + assert(watermarkFrom("leftTime > rightTime + interval 1 second") === Some(11000)) + assert(watermarkFrom("leftTime + interval 2 seconds > rightTime ") === Some(8000)) + assert(watermarkFrom("leftTime > rightTime - interval 3 second") === Some(7000)) + assert(watermarkFrom("rightTime < leftTime - interval 3 second") === Some(13000)) + assert(watermarkFrom("rightTime - interval 1 second < leftTime - interval 3 second") + === Some(12000)) + + // Test with casted long type + constants on either side of equation + // Note: long type and constants commute, so more combinations to test. 
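+    // For example, "CAST(leftTime AS LONG) > CAST(rightTime AS LONG) + 1" solves to
+    // leftTime > watermark + 1 s, i.e. 10000 + 1000 = 11000 ms, while a constant added on the
+    // left-hand side shifts the result the other way (e.g. "+ 2" on the left gives 8000 ms).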
+ // -- Constants on the right + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) + 1") === Some(11000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) - 1") === Some(9000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST((rightTime + interval 1 second) AS LONG)") + === Some(11000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > 2 + CAST(rightTime AS LONG)") === Some(12000)) + assert(watermarkFrom("CAST(leftTime AS LONG) > -0.5 + CAST(rightTime AS LONG)") === Some(9500)) + assert(watermarkFrom("CAST(leftTime AS LONG) - CAST(rightTime AS LONG) > 2") === Some(12000)) + assert(watermarkFrom("-CAST(rightTime AS DOUBLE) + CAST(leftTime AS LONG) > 0.1") + === Some(10100)) + assert(watermarkFrom("0 > CAST(rightTime AS LONG) - CAST(leftTime AS LONG) + 0.2") + === Some(10200)) + // -- Constants on the left + assert(watermarkFrom("CAST(leftTime AS LONG) + 2 > CAST(rightTime AS LONG)") === Some(8000)) + assert(watermarkFrom("1 + CAST(leftTime AS LONG) > CAST(rightTime AS LONG)") === Some(9000)) + assert(watermarkFrom("CAST((leftTime + interval 3 second) AS LONG) > CAST(rightTime AS LONG)") + === Some(7000)) + assert(watermarkFrom("CAST(leftTime AS LONG) - 2 > CAST(rightTime AS LONG)") === Some(12000)) + assert(watermarkFrom("CAST(leftTime AS LONG) + 0.5 > CAST(rightTime AS LONG)") === Some(9500)) + assert(watermarkFrom("CAST(leftTime AS LONG) - CAST(rightTime AS LONG) - 2 > 0") + === Some(12000)) + assert(watermarkFrom("-CAST(rightTime AS LONG) + CAST(leftTime AS LONG) - 0.1 > 0") + === Some(10100)) + // -- Constants on both sides, mixed types + assert(watermarkFrom("CAST(leftTime AS LONG) - 2.0 > CAST(rightTime AS LONG) + 1") + === Some(13000)) + + // Test multiple conditions, should return minimum watermark + assert(watermarkFrom( + "leftTime > rightTime - interval 3 second AND rightTime < leftTime + interval 2 seconds") === + Some(7000)) // first condition wins + assert(watermarkFrom( + "leftTime > rightTime - interval 3 second AND rightTime < leftTime + interval 4 seconds") === + Some(6000)) // second condition wins + + // Test invalid comparisons + assert(watermarkFrom("cast(leftTime AS LONG) > leftOther") === None) // non-time attributes + assert(watermarkFrom("leftOther > rightOther") === None) // non-time attributes + assert(watermarkFrom("leftOther > rightOther AND leftTime > rightTime") === Some(10000)) + assert(watermarkFrom("cast(rightTime AS DOUBLE) < rightOther") === None) // non-time attributes + assert(watermarkFrom("leftTime > rightTime + interval 1 month") === None) // month not allowed + + // Test static comparisons + assert(watermarkFrom("cast(leftTime AS LONG) > 10") === Some(10000)) + + // Test non-positive results + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) - 10") === Some(0)) + assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) - 100") === Some(-90000)) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index 11f48a39c1e25..e5057c451d5b8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{FlatMapGroupsWithState, _} import 
org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, LongType, MetadataBuilder} +import org.apache.spark.unsafe.types.CalendarInterval /** A dummy command for testing unsupported operations. */ case class DummyCommand() extends Command @@ -417,9 +418,57 @@ class UnsupportedOperationsSuite extends SparkFunSuite { testBinaryOperationInStreamingPlan( "left outer join", _.join(_, joinType = LeftOuter), - streamStreamSupported = false, batchStreamSupported = false, - expectedMsg = "left outer/semi/anti joins") + streamStreamSupported = false, + expectedMsg = "outer join") + + // Left outer joins: stream-stream allowed with join on watermark attribute + // Note that the attribute need not be watermarked on both sides. + assertSupportedInStreamingPlan( + s"left outer join with stream-stream relations and join on attribute with left watermark", + streamRelation.join(streamRelation, joinType = LeftOuter, + condition = Some(attributeWithWatermark === attribute)), + OutputMode.Append()) + assertSupportedInStreamingPlan( + s"left outer join with stream-stream relations and join on attribute with right watermark", + streamRelation.join(streamRelation, joinType = LeftOuter, + condition = Some(attribute === attributeWithWatermark)), + OutputMode.Append()) + assertNotSupportedInStreamingPlan( + s"left outer join with stream-stream relations and join on non-watermarked attribute", + streamRelation.join(streamRelation, joinType = LeftOuter, + condition = Some(attribute === attribute)), + OutputMode.Append(), + Seq("watermark in the join keys")) + + // Left outer joins: stream-stream allowed with range condition yielding state value watermark + assertSupportedInStreamingPlan( + s"left outer join with stream-stream relations and state value watermark", { + val leftRelation = streamRelation + val rightTimeWithWatermark = + AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) + val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) + leftRelation.join( + rightRelation, + joinType = LeftOuter, + condition = Some(attribute > rightTimeWithWatermark + 10)) + }, + OutputMode.Append()) + + // Left outer joins: stream-stream not allowed with insufficient range condition + assertNotSupportedInStreamingPlan( + s"left outer join with stream-stream relations and state value watermark", { + val leftRelation = streamRelation + val rightTimeWithWatermark = + AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) + val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) + leftRelation.join( + rightRelation, + joinType = LeftOuter, + condition = Some(attribute < rightTimeWithWatermark + 10)) + }, + OutputMode.Append(), + Seq("appropriate range condition")) // Left semi joins: stream-* not allowed testBinaryOperationInStreamingPlan( @@ -427,7 +476,7 @@ class UnsupportedOperationsSuite extends SparkFunSuite { _.join(_, joinType = LeftSemi), streamStreamSupported = false, batchStreamSupported = false, - expectedMsg = "left outer/semi/anti joins") + expectedMsg = "left semi/anti joins") // Left anti joins: stream-* not allowed testBinaryOperationInStreamingPlan( @@ -435,14 +484,63 @@ class UnsupportedOperationsSuite extends SparkFunSuite { _.join(_, joinType = LeftAnti), streamStreamSupported = false, batchStreamSupported = false, - expectedMsg = "left outer/semi/anti joins") + expectedMsg = "left semi/anti joins") // Right outer joins: stream-* not allowed 
testBinaryOperationInStreamingPlan( "right outer join", _.join(_, joinType = RightOuter), + streamBatchSupported = false, streamStreamSupported = false, - streamBatchSupported = false) + expectedMsg = "outer join") + + // Right outer joins: stream-stream allowed with join on watermark attribute + // Note that the attribute need not be watermarked on both sides. + assertSupportedInStreamingPlan( + s"right outer join with stream-stream relations and join on attribute with left watermark", + streamRelation.join(streamRelation, joinType = RightOuter, + condition = Some(attributeWithWatermark === attribute)), + OutputMode.Append()) + assertSupportedInStreamingPlan( + s"right outer join with stream-stream relations and join on attribute with right watermark", + streamRelation.join(streamRelation, joinType = RightOuter, + condition = Some(attribute === attributeWithWatermark)), + OutputMode.Append()) + assertNotSupportedInStreamingPlan( + s"right outer join with stream-stream relations and join on non-watermarked attribute", + streamRelation.join(streamRelation, joinType = RightOuter, + condition = Some(attribute === attribute)), + OutputMode.Append(), + Seq("watermark in the join keys")) + + // Right outer joins: stream-stream allowed with range condition yielding state value watermark + assertSupportedInStreamingPlan( + s"right outer join with stream-stream relations and state value watermark", { + val leftTimeWithWatermark = + AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) + val leftRelation = new TestStreamingRelation(leftTimeWithWatermark) + val rightRelation = streamRelation + leftRelation.join( + rightRelation, + joinType = RightOuter, + condition = Some(leftTimeWithWatermark + 10 < attribute)) + }, + OutputMode.Append()) + + // Right outer joins: stream-stream not allowed with insufficient range condition + assertNotSupportedInStreamingPlan( + s"right outer join with stream-stream relations and state value watermark", { + val leftTimeWithWatermark = + AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) + val leftRelation = new TestStreamingRelation(leftTimeWithWatermark) + val rightRelation = streamRelation + leftRelation.join( + rightRelation, + joinType = RightOuter, + condition = Some(leftTimeWithWatermark + 10 > attribute)) + }, + OutputMode.Append(), + Seq("appropriate range condition")) // Cogroup: only batch-batch is allowed testBinaryOperationInStreamingPlan( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 44f1fa58599d2..9bd2127a28ff6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, JoinedRow, Literal, NamedExpression, PreciseTimestampConversion, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, GenericInternalRow, JoinedRow, Literal, NamedExpression, PreciseTimestampConversion, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans._ import 
org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ import org.apache.spark.sql.catalyst.plans.physical._ @@ -146,7 +146,14 @@ case class StreamingSymmetricHashJoinExec( stateWatermarkPredicates = JoinStateWatermarkPredicates(), left, right) } - require(joinType == Inner, s"${getClass.getSimpleName} should not take $joinType as the JoinType") + private def throwBadJoinTypeException(): Nothing = { + throw new IllegalArgumentException( + s"${getClass.getSimpleName} should not take $joinType as the JoinType") + } + + require( + joinType == Inner || joinType == LeftOuter || joinType == RightOuter, + s"${getClass.getSimpleName} should not take $joinType as the JoinType") require(leftKeys.map(_.dataType) == rightKeys.map(_.dataType)) private val storeConf = new StateStoreConf(sqlContext.conf) @@ -157,11 +164,18 @@ case class StreamingSymmetricHashJoinExec( override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil - override def output: Seq[Attribute] = left.output ++ right.output + override def output: Seq[Attribute] = joinType match { + case _: InnerLike => left.output ++ right.output + case LeftOuter => left.output ++ right.output.map(_.withNullability(true)) + case RightOuter => left.output.map(_.withNullability(true)) ++ right.output + case _ => throwBadJoinTypeException() + } override def outputPartitioning: Partitioning = joinType match { case _: InnerLike => PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) + case LeftOuter => PartitioningCollection(Seq(left.outputPartitioning)) + case RightOuter => PartitioningCollection(Seq(right.outputPartitioning)) case x => throw new IllegalArgumentException( s"${getClass.getSimpleName} should not take $x as the JoinType") @@ -207,31 +221,108 @@ case class StreamingSymmetricHashJoinExec( // matching new left input with new right input, since the new left input has become stored // by that point. This tiny asymmetry is necessary to avoid duplication. val leftOutputIter = leftSideJoiner.storeAndJoinWithOtherSide(rightSideJoiner) { - (inputRow: UnsafeRow, matchedRow: UnsafeRow) => - joinedRow.withLeft(inputRow).withRight(matchedRow) + (input: UnsafeRow, matched: UnsafeRow) => joinedRow.withLeft(input).withRight(matched) } val rightOutputIter = rightSideJoiner.storeAndJoinWithOtherSide(leftSideJoiner) { - (inputRow: UnsafeRow, matchedRow: UnsafeRow) => - joinedRow.withLeft(matchedRow).withRight(inputRow) + (input: UnsafeRow, matched: UnsafeRow) => joinedRow.withLeft(matched).withRight(input) } // Filter the joined rows based on the given condition. - val outputFilterFunction = - newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output).eval _ - val filteredOutputIter = - (leftOutputIter ++ rightOutputIter).filter(outputFilterFunction).map { row => - numOutputRows += 1 - row - } + val outputFilterFunction = newPredicate(condition.getOrElse(Literal(true)), output).eval _ + + // We need to save the time that the inner join output iterator completes, since outer join + // output counts as both update and removal time. 
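+    // Put differently: all processing time is counted towards allUpdatesTimeMs, while the portion
+    // spent after the inner output completes (producing the outer "null" rows, which also removes
+    // old state) is additionally counted towards allRemovalsTimeMs in onOutputCompletion below.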
+ var innerOutputCompletionTimeNs: Long = 0 + def onInnerOutputCompletion = { + innerOutputCompletionTimeNs = System.nanoTime + } + val filteredInnerOutputIter = CompletionIterator[InternalRow, Iterator[InternalRow]]( + (leftOutputIter ++ rightOutputIter).filter(outputFilterFunction), onInnerOutputCompletion) + + def matchesWithRightSideState(leftKeyValue: UnsafeRowPair) = { + rightSideJoiner.get(leftKeyValue.key).exists( + rightValue => { + outputFilterFunction( + joinedRow.withLeft(leftKeyValue.value).withRight(rightValue)) + }) + } + + def matchesWithLeftSideState(rightKeyValue: UnsafeRowPair) = { + leftSideJoiner.get(rightKeyValue.key).exists( + leftValue => { + outputFilterFunction( + joinedRow.withLeft(leftValue).withRight(rightKeyValue.value)) + }) + } + + val outputIter: Iterator[InternalRow] = joinType match { + case Inner => + filteredInnerOutputIter + case LeftOuter => + // We generate the outer join input by: + // * Getting an iterator over the rows that have aged out on the left side. These rows are + // candidates for being null joined. Note that to avoid doing two passes, this iterator + // removes the rows from the state manager as they're processed. + // * Checking whether the current row matches a key in the right side state, and that key + // has any value which satisfies the filter function when joined. If it doesn't, + // we know we can join with null, since there was never (including this batch) a match + // within the watermark period. If it does, there must have been a match at some point, so + // we know we can't join with null. + val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length) + val removedRowIter = leftSideJoiner.removeOldState() + val outerOutputIter = removedRowIter + .filterNot(pair => matchesWithRightSideState(pair)) + .map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) + + filteredInnerOutputIter ++ outerOutputIter + case RightOuter => + // See comments for left outer case. + val nullLeft = new GenericInternalRow(left.output.map(_.withNullability(true)).length) + val removedRowIter = rightSideJoiner.removeOldState() + val outerOutputIter = removedRowIter + .filterNot(pair => matchesWithLeftSideState(pair)) + .map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) + + filteredInnerOutputIter ++ outerOutputIter + case _ => throwBadJoinTypeException() + } + + val outputIterWithMetrics = outputIter.map { row => + numOutputRows += 1 + row + } // Function to remove old state after all the input has been consumed and output generated def onOutputCompletion = { + // All processing time counts as update time. allUpdatesTimeMs += math.max(NANOSECONDS.toMillis(System.nanoTime - updateStartTimeNs), 0) - // Remove old state if needed + // Processing time between inner output completion and here comes from the outer portion of a + // join, and thus counts as removal time as we remove old state from one side while iterating. + if (innerOutputCompletionTimeNs != 0) { + allRemovalsTimeMs += + math.max(NANOSECONDS.toMillis(System.nanoTime - innerOutputCompletionTimeNs), 0) + } + allRemovalsTimeMs += timeTakenMs { - leftSideJoiner.removeOldState() - rightSideJoiner.removeOldState() + // Remove any remaining state rows which aren't needed because they're below the watermark. + // + // For inner joins, we have to remove unnecessary state rows from both sides if possible. 
+ // For outer joins, we have already removed unnecessary state rows from the outer side + // (e.g., left side for left outer join) while generating the outer "null" outputs. Now, we + // have to remove unnecessary state rows from the other side (e.g., right side for the left + // outer join) if possible. In all cases, nothing needs to be outputted, hence the removal + // needs to be done greedily by immediately consuming the returned iterator. + val cleanupIter = joinType match { + case Inner => + leftSideJoiner.removeOldState() ++ rightSideJoiner.removeOldState() + case LeftOuter => rightSideJoiner.removeOldState() + case RightOuter => leftSideJoiner.removeOldState() + case _ => throwBadJoinTypeException() + } + while (cleanupIter.hasNext) { + cleanupIter.next() + } } // Commit all state changes and update state store metrics @@ -251,7 +342,8 @@ case class StreamingSymmetricHashJoinExec( } } - CompletionIterator[InternalRow, Iterator[InternalRow]](filteredOutputIter, onOutputCompletion) + CompletionIterator[InternalRow, Iterator[InternalRow]]( + outputIterWithMetrics, onOutputCompletion) } /** @@ -324,14 +416,32 @@ case class StreamingSymmetricHashJoinExec( } } - /** Remove old buffered state rows using watermarks for state keys and values */ - def removeOldState(): Unit = { + /** + * Get an iterator over the values stored in this joiner's state manager for the given key. + * + * Should not be interleaved with mutations. + */ + def get(key: UnsafeRow): Iterator[UnsafeRow] = { + joinStateManager.get(key) + } + + /** + * Builds an iterator over old state key-value pairs, removing them lazily as they're produced. + * + * @note This iterator must be consumed fully before any other operations are made + * against this joiner's join state manager. For efficiency reasons, the intermediate states of + * the iterator leave the state manager in an undefined state. + * + * We do this to avoid requiring either two passes or full materialization when + * processing the rows for outer join. 
+ */ + def removeOldState(): Iterator[UnsafeRowPair] = { stateWatermarkPredicate match { case Some(JoinStateKeyWatermarkPredicate(expr)) => joinStateManager.removeByKeyCondition(stateKeyWatermarkPredicateFunc) case Some(JoinStateValueWatermarkPredicate(expr)) => joinStateManager.removeByValueCondition(stateValueWatermarkPredicateFunc) - case _ => + case _ => Iterator.empty } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index e50274a1baba1..64c7189f72ac3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -23,6 +23,7 @@ import scala.util.control.NonFatal import org.apache.spark.{Partition, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.{RDD, ZippedPartitionsRDD2} +import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper import org.apache.spark.sql.catalyst.expressions.{Add, Attribute, AttributeReference, AttributeSet, BoundReference, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, NamedExpression, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ import org.apache.spark.sql.execution.streaming.WatermarkSupport.watermarkExpression @@ -34,7 +35,7 @@ import org.apache.spark.unsafe.types.CalendarInterval /** * Helper object for [[StreamingSymmetricHashJoinExec]]. See that object for more details. */ -object StreamingSymmetricHashJoinHelper extends PredicateHelper with Logging { +object StreamingSymmetricHashJoinHelper extends Logging { sealed trait JoinSide case object LeftSide extends JoinSide { override def toString(): String = "left" } @@ -111,7 +112,7 @@ object StreamingSymmetricHashJoinHelper extends PredicateHelper with Logging { expr.map(JoinStateKeyWatermarkPredicate.apply _) } else if (isWatermarkDefinedOnInput) { // case 2 in the StreamingSymmetricHashJoinExec docs - val stateValueWatermark = getStateValueWatermark( + val stateValueWatermark = StreamingJoinHelper.getStateValueWatermark( attributesToFindStateWatermarkFor = AttributeSet(oneSideInputAttributes), attributesWithEventWatermark = AttributeSet(otherSideInputAttributes), condition, @@ -132,242 +133,6 @@ object StreamingSymmetricHashJoinHelper extends PredicateHelper with Logging { JoinStateWatermarkPredicates(leftStateWatermarkPredicate, rightStateWatermarkPredicate) } - /** - * Get state value watermark (see [[StreamingSymmetricHashJoinExec]] for context about it) - * given the join condition and the event time watermark. This is how it works. - * - The condition is split into conjunctive predicates, and we find the predicates of the - * form `leftTime + c1 < rightTime + c2` (or <=, >, >=). - * - We canoncalize the predicate and solve it with the event time watermark value to find the - * value of the state watermark. - * This function is supposed to make best-effort attempt to get the state watermark. If there is - * any error, it will return None. 
- * - * @param attributesToFindStateWatermarkFor attributes of the side whose state watermark - * is to be calculated - * @param attributesWithEventWatermark attributes of the other side which has a watermark column - * @param joinCondition join condition - * @param eventWatermark watermark defined on the input event data - * @return state value watermark in milliseconds, is possible. - */ - def getStateValueWatermark( - attributesToFindStateWatermarkFor: AttributeSet, - attributesWithEventWatermark: AttributeSet, - joinCondition: Option[Expression], - eventWatermark: Option[Long]): Option[Long] = { - - // If condition or event time watermark is not provided, then cannot calculate state watermark - if (joinCondition.isEmpty || eventWatermark.isEmpty) return None - - // If there is not watermark attribute, then cannot define state watermark - if (!attributesWithEventWatermark.exists(_.metadata.contains(delayKey))) return None - - def getStateWatermarkSafely(l: Expression, r: Expression): Option[Long] = { - try { - getStateWatermarkFromLessThenPredicate( - l, r, attributesToFindStateWatermarkFor, attributesWithEventWatermark, eventWatermark) - } catch { - case NonFatal(e) => - logWarning(s"Error trying to extract state constraint from condition $joinCondition", e) - None - } - } - - val allStateWatermarks = splitConjunctivePredicates(joinCondition.get).flatMap { predicate => - - // The generated the state watermark cleanup expression is inclusive of the state watermark. - // If state watermark is W, all state where timestamp <= W will be cleaned up. - // Now when the canonicalized join condition solves to leftTime >= W, we dont want to clean - // up leftTime <= W. Rather we should clean up leftTime <= W - 1. Hence the -1 below. - val stateWatermark = predicate match { - case LessThan(l, r) => getStateWatermarkSafely(l, r) - case LessThanOrEqual(l, r) => getStateWatermarkSafely(l, r).map(_ - 1) - case GreaterThan(l, r) => getStateWatermarkSafely(r, l) - case GreaterThanOrEqual(l, r) => getStateWatermarkSafely(r, l).map(_ - 1) - case _ => None - } - if (stateWatermark.nonEmpty) { - logInfo(s"Condition $joinCondition generated watermark constraint = ${stateWatermark.get}") - } - stateWatermark - } - allStateWatermarks.reduceOption((x, y) => Math.min(x, y)) - } - - /** - * Extract the state value watermark (milliseconds) from the condition - * `LessThan(leftExpr, rightExpr)` where . For example: if we want to find the constraint for - * leftTime using the watermark on the rightTime. 
Example: - * - * Input: rightTime-with-watermark + c1 < leftTime + c2 - * Canonical form: rightTime-with-watermark + c1 + (-c2) + (-leftTime) < 0 - * Solving for rightTime: rightTime-with-watermark + c1 + (-c2) < leftTime - * With watermark value: watermark-value + c1 + (-c2) < leftTime - */ - private def getStateWatermarkFromLessThenPredicate( - leftExpr: Expression, - rightExpr: Expression, - attributesToFindStateWatermarkFor: AttributeSet, - attributesWithEventWatermark: AttributeSet, - eventWatermark: Option[Long]): Option[Long] = { - - val attributesInCondition = AttributeSet( - leftExpr.collect { case a: AttributeReference => a } ++ - rightExpr.collect { case a: AttributeReference => a } - ) - if (attributesInCondition.filter { attributesToFindStateWatermarkFor.contains(_) }.size > 1 || - attributesInCondition.filter { attributesWithEventWatermark.contains(_) }.size > 1) { - // If more than attributes present in condition from one side, then it cannot be solved - return None - } - - def containsAttributeToFindStateConstraintFor(e: Expression): Boolean = { - e.collectLeaves().collectFirst { - case a @ AttributeReference(_, TimestampType, _, _) - if attributesToFindStateWatermarkFor.contains(a) => a - }.nonEmpty - } - - // Canonicalization step 1: convert to (rightTime-with-watermark + c1) - (leftTime + c2) < 0 - val allOnLeftExpr = Subtract(leftExpr, rightExpr) - logDebug(s"All on Left:\n${allOnLeftExpr.treeString(true)}\n${allOnLeftExpr.asCode}") - - // Canonicalization step 2: extract commutative terms - // rightTime-with-watermark, c1, -leftTime, -c2 - val terms = ExpressionSet(collectTerms(allOnLeftExpr)) - logDebug("Terms extracted from join condition:\n\t" + terms.mkString("\n\t")) - - - - // Find the term that has leftTime (i.e. the one present in attributesToFindConstraintFor - val constraintTerms = terms.filter(containsAttributeToFindStateConstraintFor) - - // Verify there is only one correct constraint term and of the correct type - if (constraintTerms.size > 1) { - logWarning("Failed to extract state constraint terms: multiple time terms in condition\n\t" + - terms.mkString("\n\t")) - return None - } - if (constraintTerms.isEmpty) { - logDebug("Failed to extract state constraint terms: no time terms in condition\n\t" + - terms.mkString("\n\t")) - return None - } - val constraintTerm = constraintTerms.head - if (constraintTerm.collectFirst { case u: UnaryMinus => u }.isEmpty) { - // Incorrect condition. We want the constraint term in canonical form to be `-leftTime` - // so that resolve for it as `-leftTime + watermark + c < 0` ==> `watermark + c < leftTime`. - // Now, if the original conditions is `rightTime-with-watermark > leftTime` and watermark - // condition is `rightTime-with-watermark > watermarkValue`, then no constraint about - // `leftTime` can be inferred. In this case, after canonicalization and collection of terms, - // the constraintTerm would be `leftTime` and not `-leftTime`. Hence, we return None. - return None - } - - // Replace watermark attribute with watermark value, and generate the resolved expression - // from the other terms. 
That is, - // rightTime-with-watermark, c1, -c2 => watermark, c1, -c2 => watermark + c1 + (-c2) - logDebug(s"Constraint term from join condition:\t$constraintTerm") - val exprWithWatermarkSubstituted = (terms - constraintTerm).map { term => - term.transform { - case a @ AttributeReference(_, TimestampType, _, metadata) - if attributesWithEventWatermark.contains(a) && metadata.contains(delayKey) => - Multiply(Literal(eventWatermark.get.toDouble), Literal(1000.0)) - } - }.reduceLeft(Add) - - // Calculate the constraint value - logInfo(s"Final expression to evaluate constraint:\t$exprWithWatermarkSubstituted") - val constraintValue = exprWithWatermarkSubstituted.eval().asInstanceOf[java.lang.Double] - Some((Double2double(constraintValue) / 1000.0).toLong) - } - - /** - * Collect all the terms present in an expression after converting it into the form - * a + b + c + d where each term be either an attribute or a literal casted to long, - * optionally wrapped in a unary minus. - */ - private def collectTerms(exprToCollectFrom: Expression): Seq[Expression] = { - var invalid = false - - /** Wrap a term with UnaryMinus if its needs to be negated. */ - def negateIfNeeded(expr: Expression, minus: Boolean): Expression = { - if (minus) UnaryMinus(expr) else expr - } - - /** - * Recursively split the expression into its leaf terms contains attributes or literals. - * Returns terms only of the forms: - * Cast(AttributeReference), UnaryMinus(Cast(AttributeReference)), - * Cast(AttributeReference, Double), UnaryMinus(Cast(AttributeReference, Double)) - * Multiply(Literal), UnaryMinus(Multiply(Literal)) - * Multiply(Cast(Literal)), UnaryMinus(Multiple(Cast(Literal))) - * - * Note: - * - If term needs to be negated for making it a commutative term, - * then it will be wrapped in UnaryMinus(...) - * - Each terms will be representing timestamp value or time interval in microseconds, - * typed as doubles. - */ - def collect(expr: Expression, negate: Boolean): Seq[Expression] = { - expr match { - case Add(left, right) => - collect(left, negate) ++ collect(right, negate) - case Subtract(left, right) => - collect(left, negate) ++ collect(right, !negate) - case TimeAdd(left, right, _) => - collect(left, negate) ++ collect(right, negate) - case TimeSub(left, right, _) => - collect(left, negate) ++ collect(right, !negate) - case UnaryMinus(child) => - collect(child, !negate) - case CheckOverflow(child, _) => - collect(child, negate) - case Cast(child, dataType, _) => - dataType match { - case _: NumericType | _: TimestampType => collect(child, negate) - case _ => - invalid = true - Seq.empty - } - case a: AttributeReference => - val castedRef = if (a.dataType != DoubleType) Cast(a, DoubleType) else a - Seq(negateIfNeeded(castedRef, negate)) - case lit: Literal => - // If literal of type calendar interval, then explicitly convert to millis - // Convert other number like literal to doubles representing millis (by x1000) - val castedLit = lit.dataType match { - case CalendarIntervalType => - val calendarInterval = lit.value.asInstanceOf[CalendarInterval] - if (calendarInterval.months > 0) { - invalid = true - logWarning( - s"Failed to extract state value watermark from condition $exprToCollectFrom " + - s"as imprecise intervals like months and years cannot be used for" + - s"watermark calculation. 
Use interval in terms of day instead.") - Literal(0.0) - } else { - Literal(calendarInterval.microseconds.toDouble) - } - case DoubleType => - Multiply(lit, Literal(1000000.0)) - case _: NumericType => - Multiply(Cast(lit, DoubleType), Literal(1000000.0)) - case _: TimestampType => - Multiply(PreciseTimestampConversion(lit, TimestampType, LongType), Literal(1000000.0)) - } - Seq(negateIfNeeded(castedLit, negate)) - case a @ _ => - logWarning( - s"Failed to extract state value watermark from condition $exprToCollectFrom due to $a") - invalid = true - Seq.empty - } - } - - val terms = collect(exprToCollectFrom, negate = false) - if (!invalid) terms else Seq.empty - } - /** * A custom RDD that allows partitions to be "zipped" together, while ensuring the tasks' * preferred location is based on which executors have the required join state stores already diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index 37648710dfc2a..d256fb578d921 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -76,7 +76,7 @@ class SymmetricHashJoinStateManager( /** Get all the values of a key */ def get(key: UnsafeRow): Iterator[UnsafeRow] = { val numValues = keyToNumValues.get(key) - keyWithIndexToValue.getAll(key, numValues) + keyWithIndexToValue.getAll(key, numValues).map(_.value) } /** Append a new value to the key */ @@ -87,70 +87,163 @@ class SymmetricHashJoinStateManager( } /** - * Remove using a predicate on keys. See class docs for more context and implement details. + * Remove using a predicate on keys. + * + * This produces an iterator over the (key, value) pairs satisfying condition(key), where the + * underlying store is updated as a side-effect of producing next. + * + * This implies the iterator must be consumed fully without any other operations on this manager + * or the underlying store being interleaved. */ - def removeByKeyCondition(condition: UnsafeRow => Boolean): Unit = { - val allKeyToNumValues = keyToNumValues.iterator - - while (allKeyToNumValues.hasNext) { - val keyToNumValue = allKeyToNumValues.next - if (condition(keyToNumValue.key)) { - keyToNumValues.remove(keyToNumValue.key) - keyWithIndexToValue.removeAllValues(keyToNumValue.key, keyToNumValue.numValue) + def removeByKeyCondition(removalCondition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { + new NextIterator[UnsafeRowPair] { + + private val allKeyToNumValues = keyToNumValues.iterator + + private var currentKeyToNumValue: KeyAndNumValues = null + private var currentValues: Iterator[KeyWithIndexAndValue] = null + + private def currentKey = currentKeyToNumValue.key + + private val reusedPair = new UnsafeRowPair() + + private def getAndRemoveValue() = { + val keyWithIndexAndValue = currentValues.next() + keyWithIndexToValue.remove(currentKey, keyWithIndexAndValue.valueIndex) + reusedPair.withRows(currentKey, keyWithIndexAndValue.value) + } + + override def getNext(): UnsafeRowPair = { + // If there are more values for the current key, remove and return the next one. 
+ if (currentValues != null && currentValues.hasNext) { + return getAndRemoveValue() + } + + // If there weren't any values left, try and find the next key that satisfies the removal + // condition and has values. + while (allKeyToNumValues.hasNext) { + currentKeyToNumValue = allKeyToNumValues.next() + if (removalCondition(currentKey)) { + currentValues = keyWithIndexToValue.getAll( + currentKey, currentKeyToNumValue.numValue) + keyToNumValues.remove(currentKey) + + if (currentValues.hasNext) { + return getAndRemoveValue() + } + } + } + + // We only reach here if there were no satisfying keys left, which means we're done. + finished = true + return null } + + override def close: Unit = {} } } /** - * Remove using a predicate on values. See class docs for more context and implementation details. + * Remove using a predicate on values. + * + * At a high level, this produces an iterator over the (key, value) pairs such that value + * satisfies the predicate, where producing an element removes the value from the state store + * and producing all elements with a given key updates it accordingly. + * + * This implies the iterator must be consumed fully without any other operations on this manager + * or the underlying store being interleaved. */ - def removeByValueCondition(condition: UnsafeRow => Boolean): Unit = { - val allKeyToNumValues = keyToNumValues.iterator + def removeByValueCondition(removalCondition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { + new NextIterator[UnsafeRowPair] { - while (allKeyToNumValues.hasNext) { - val keyToNumValue = allKeyToNumValues.next - val key = keyToNumValue.key + // Reuse this object to avoid creation+GC overhead. + private val reusedPair = new UnsafeRowPair() - var numValues: Long = keyToNumValue.numValue - var index: Long = 0L - var valueRemoved: Boolean = false - var valueForIndex: UnsafeRow = null + private val allKeyToNumValues = keyToNumValues.iterator - while (index < numValues) { - if (valueForIndex == null) { - valueForIndex = keyWithIndexToValue.get(key, index) + private var currentKey: UnsafeRow = null + private var numValues: Long = 0L + private var index: Long = 0L + private var valueRemoved: Boolean = false + + // Push the data for the current key to the numValues store, and reset the tracking variables + // to their empty state. + private def updateNumValueForCurrentKey(): Unit = { + if (valueRemoved) { + if (numValues >= 1) { + keyToNumValues.put(currentKey, numValues) + } else { + keyToNumValues.remove(currentKey) + } } - if (condition(valueForIndex)) { - if (numValues > 1) { - val valueAtMaxIndex = keyWithIndexToValue.get(key, numValues - 1) - keyWithIndexToValue.put(key, index, valueAtMaxIndex) - keyWithIndexToValue.remove(key, numValues - 1) - valueForIndex = valueAtMaxIndex + + currentKey = null + numValues = 0 + index = 0 + valueRemoved = false + } + + // Find the next value satisfying the condition, updating `currentKey` and `numValues` if + // needed. Returns null when no value can be found. + private def findNextValueForIndex(): UnsafeRow = { + // Loop across all values for the current key, and then all other keys, until we find a + // value satisfying the removal condition. + def hasMoreValuesForCurrentKey = currentKey != null && index < numValues + def hasMoreKeys = allKeyToNumValues.hasNext + while (hasMoreValuesForCurrentKey || hasMoreKeys) { + if (hasMoreValuesForCurrentKey) { + // First search the values for the current key. 
+ val currentValue = keyWithIndexToValue.get(currentKey, index) + if (removalCondition(currentValue)) { + return currentValue + } else { + index += 1 + } + } else if (hasMoreKeys) { + // If we can't find a value for the current key, cleanup and start looking at the next. + // This will also happen the first time the iterator is called. + updateNumValueForCurrentKey() + + val currentKeyToNumValue = allKeyToNumValues.next() + currentKey = currentKeyToNumValue.key + numValues = currentKeyToNumValue.numValue } else { - keyWithIndexToValue.remove(key, 0) - valueForIndex = null + // Should be unreachable, but in any case means a value couldn't be found. + return null } - numValues -= 1 - valueRemoved = true - } else { - valueForIndex = null - index += 1 } + + // We tried and failed to find the next value. + return null } - if (valueRemoved) { - if (numValues >= 1) { - keyToNumValues.put(key, numValues) + + override def getNext(): UnsafeRowPair = { + val currentValue = findNextValueForIndex() + + // If there's no value, clean up and finish. There aren't any more available. + if (currentValue == null) { + updateNumValueForCurrentKey() + finished = true + return null + } + + // The backing store is arraylike - we as the caller are responsible for filling back in + // any hole. So we swap the last element into the hole and decrement numValues to shorten. + // clean + if (numValues > 1) { + val valueAtMaxIndex = keyWithIndexToValue.get(currentKey, numValues - 1) + keyWithIndexToValue.put(currentKey, index, valueAtMaxIndex) + keyWithIndexToValue.remove(currentKey, numValues - 1) } else { - keyToNumValues.remove(key) + keyWithIndexToValue.remove(currentKey, 0) } + numValues -= 1 + valueRemoved = true + + return reusedPair.withRows(currentKey, currentValue) } - } - } - def iterator(): Iterator[UnsafeRowPair] = { - val pair = new UnsafeRowPair() - keyWithIndexToValue.iterator.map { x => - pair.withRows(x.key, x.value) + override def close: Unit = {} } } @@ -309,19 +402,24 @@ class SymmetricHashJoinStateManager( stateStore.get(keyWithIndexRow(key, valueIndex)) } - /** Get all the values for key and all indices. */ - def getAll(key: UnsafeRow, numValues: Long): Iterator[UnsafeRow] = { + /** + * Get all values and indices for the provided key. + * Should not return null. 
+ */ + def getAll(key: UnsafeRow, numValues: Long): Iterator[KeyWithIndexAndValue] = { + val keyWithIndexAndValue = new KeyWithIndexAndValue() var index = 0 - new NextIterator[UnsafeRow] { - override protected def getNext(): UnsafeRow = { + new NextIterator[KeyWithIndexAndValue] { + override protected def getNext(): KeyWithIndexAndValue = { if (index >= numValues) { finished = true null } else { val keyWithIndex = keyWithIndexRow(key, index) val value = stateStore.get(keyWithIndex) + keyWithIndexAndValue.withNew(key, index, value) index += 1 - value + keyWithIndexAndValue } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala index ffa4c3c22a194..d44af1d14c27a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala @@ -137,14 +137,16 @@ class SymmetricHashJoinStateManagerSuite extends StreamTest with BeforeAndAfter BoundReference( 1, inputValueAttribWithWatermark.dataType, inputValueAttribWithWatermark.nullable), Literal(threshold)) - manager.removeByKeyCondition(GeneratePredicate.generate(expr).eval _) + val iter = manager.removeByKeyCondition(GeneratePredicate.generate(expr).eval _) + while (iter.hasNext) iter.next() } /** Remove values where `time <= threshold` */ def removeByValue(watermark: Long)(implicit manager: SymmetricHashJoinStateManager): Unit = { val expr = LessThanOrEqual(inputValueAttribWithWatermark, Literal(watermark)) - manager.removeByValueCondition( + val iter = manager.removeByValueCondition( GeneratePredicate.generate(expr, inputValueAttribs).eval _) + while (iter.hasNext) iter.next() } def numRows(implicit manager: SymmetricHashJoinStateManager): Long = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 533e1165fd59c..a6593b71e51de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -24,8 +24,9 @@ import scala.util.Random import org.scalatest.BeforeAndAfter import org.apache.spark.scheduler.ExecutorCacheTaskLocation -import org.apache.spark.sql.{AnalysisException, SparkSession} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SparkSession} +import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Literal} import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter} import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.streaming.{MemoryStream, StatefulOperatorStateInfo, StreamingSymmetricHashJoinHelper} @@ -35,7 +36,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class StreamingJoinSuite extends StreamTest with StateStoreMetricsTest with BeforeAndAfter { +class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with BeforeAndAfter { before { SparkSession.setActiveSession(spark) // set this before force initializing 'joinExec' @@ -322,111 +323,6 @@ class 
StreamingJoinSuite extends StreamTest with StateStoreMetricsTest with Befo assert(e.toString.contains("Stream stream joins without equality predicate is not supported")) } - testQuietly("extract watermark from time condition") { - val attributesToFindConstraintFor = Seq( - AttributeReference("leftTime", TimestampType)(), - AttributeReference("leftOther", IntegerType)()) - val metadataWithWatermark = new MetadataBuilder() - .putLong(EventTimeWatermark.delayKey, 1000) - .build() - val attributesWithWatermark = Seq( - AttributeReference("rightTime", TimestampType, metadata = metadataWithWatermark)(), - AttributeReference("rightOther", IntegerType)()) - - def watermarkFrom( - conditionStr: String, - rightWatermark: Option[Long] = Some(10000)): Option[Long] = { - val conditionExpr = Some(conditionStr).map { str => - val plan = - Filter( - spark.sessionState.sqlParser.parseExpression(str), - LogicalRDD( - attributesToFindConstraintFor ++ attributesWithWatermark, - spark.sparkContext.emptyRDD)(spark)) - plan.queryExecution.optimizedPlan.asInstanceOf[Filter].condition - } - StreamingSymmetricHashJoinHelper.getStateValueWatermark( - AttributeSet(attributesToFindConstraintFor), AttributeSet(attributesWithWatermark), - conditionExpr, rightWatermark) - } - - // Test comparison directionality. E.g. if leftTime < rightTime and rightTime > watermark, - // then cannot define constraint on leftTime. - assert(watermarkFrom("leftTime > rightTime") === Some(10000)) - assert(watermarkFrom("leftTime >= rightTime") === Some(9999)) - assert(watermarkFrom("leftTime < rightTime") === None) - assert(watermarkFrom("leftTime <= rightTime") === None) - assert(watermarkFrom("rightTime > leftTime") === None) - assert(watermarkFrom("rightTime >= leftTime") === None) - assert(watermarkFrom("rightTime < leftTime") === Some(10000)) - assert(watermarkFrom("rightTime <= leftTime") === Some(9999)) - - // Test type conversions - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG)") === Some(10000)) - assert(watermarkFrom("CAST(leftTime AS LONG) < CAST(rightTime AS LONG)") === None) - assert(watermarkFrom("CAST(leftTime AS DOUBLE) > CAST(rightTime AS DOUBLE)") === Some(10000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS DOUBLE)") === Some(10000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS FLOAT)") === Some(10000)) - assert(watermarkFrom("CAST(leftTime AS DOUBLE) > CAST(rightTime AS FLOAT)") === Some(10000)) - assert(watermarkFrom("CAST(leftTime AS STRING) > CAST(rightTime AS STRING)") === None) - - // Test with timestamp type + calendar interval on either side of equation - // Note: timestamptype and calendar interval don't commute, so less valid combinations to test. - assert(watermarkFrom("leftTime > rightTime + interval 1 second") === Some(11000)) - assert(watermarkFrom("leftTime + interval 2 seconds > rightTime ") === Some(8000)) - assert(watermarkFrom("leftTime > rightTime - interval 3 second") === Some(7000)) - assert(watermarkFrom("rightTime < leftTime - interval 3 second") === Some(13000)) - assert(watermarkFrom("rightTime - interval 1 second < leftTime - interval 3 second") - === Some(12000)) - - // Test with casted long type + constants on either side of equation - // Note: long type and constants commute, so more combinations to test. 
- // -- Constants on the right - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) + 1") === Some(11000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST(rightTime AS LONG) - 1") === Some(9000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > CAST((rightTime + interval 1 second) AS LONG)") - === Some(11000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > 2 + CAST(rightTime AS LONG)") === Some(12000)) - assert(watermarkFrom("CAST(leftTime AS LONG) > -0.5 + CAST(rightTime AS LONG)") === Some(9500)) - assert(watermarkFrom("CAST(leftTime AS LONG) - CAST(rightTime AS LONG) > 2") === Some(12000)) - assert(watermarkFrom("-CAST(rightTime AS DOUBLE) + CAST(leftTime AS LONG) > 0.1") - === Some(10100)) - assert(watermarkFrom("0 > CAST(rightTime AS LONG) - CAST(leftTime AS LONG) + 0.2") - === Some(10200)) - // -- Constants on the left - assert(watermarkFrom("CAST(leftTime AS LONG) + 2 > CAST(rightTime AS LONG)") === Some(8000)) - assert(watermarkFrom("1 + CAST(leftTime AS LONG) > CAST(rightTime AS LONG)") === Some(9000)) - assert(watermarkFrom("CAST((leftTime + interval 3 second) AS LONG) > CAST(rightTime AS LONG)") - === Some(7000)) - assert(watermarkFrom("CAST(leftTime AS LONG) - 2 > CAST(rightTime AS LONG)") === Some(12000)) - assert(watermarkFrom("CAST(leftTime AS LONG) + 0.5 > CAST(rightTime AS LONG)") === Some(9500)) - assert(watermarkFrom("CAST(leftTime AS LONG) - CAST(rightTime AS LONG) - 2 > 0") - === Some(12000)) - assert(watermarkFrom("-CAST(rightTime AS LONG) + CAST(leftTime AS LONG) - 0.1 > 0") - === Some(10100)) - // -- Constants on both sides, mixed types - assert(watermarkFrom("CAST(leftTime AS LONG) - 2.0 > CAST(rightTime AS LONG) + 1") - === Some(13000)) - - // Test multiple conditions, should return minimum watermark - assert(watermarkFrom( - "leftTime > rightTime - interval 3 second AND rightTime < leftTime + interval 2 seconds") === - Some(7000)) // first condition wins - assert(watermarkFrom( - "leftTime > rightTime - interval 3 second AND rightTime < leftTime + interval 4 seconds") === - Some(6000)) // second condition wins - - // Test invalid comparisons - assert(watermarkFrom("cast(leftTime AS LONG) > leftOther") === None) // non-time attributes - assert(watermarkFrom("leftOther > rightOther") === None) // non-time attributes - assert(watermarkFrom("leftOther > rightOther AND leftTime > rightTime") === Some(10000)) - assert(watermarkFrom("cast(rightTime AS DOUBLE) < rightOther") === None) // non-time attributes - assert(watermarkFrom("leftTime > rightTime + interval 1 month") === None) // month not allowed - - // Test static comparisons - assert(watermarkFrom("cast(leftTime AS LONG) > 10") === Some(10000)) - } - test("locality preferences of StateStoreAwareZippedRDD") { import StreamingSymmetricHashJoinHelper._ @@ -470,3 +366,189 @@ class StreamingJoinSuite extends StreamTest with StateStoreMetricsTest with Befo } } } + +class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with BeforeAndAfter { + + import testImplicits._ + import org.apache.spark.sql.functions._ + + before { + SparkSession.setActiveSession(spark) // set this before force initializing 'joinExec' + spark.streams.stateStoreCoordinator // initialize the lazy coordinator + } + + after { + StateStore.stop() + } + + private def setupStream(prefix: String, multiplier: Int): (MemoryStream[Int], DataFrame) = { + val input = MemoryStream[Int] + val df = input.toDF + .select( + 'value as "key", + 'value.cast("timestamp") as s"${prefix}Time", + ('value * multiplier) as 
s"${prefix}Value") + .withWatermark(s"${prefix}Time", "10 seconds") + + return (input, df) + } + + private def setupWindowedJoin(joinType: String): + (MemoryStream[Int], MemoryStream[Int], DataFrame) = { + val (input1, df1) = setupStream("left", 2) + val (input2, df2) = setupStream("right", 3) + val windowed1 = df1.select('key, window('leftTime, "10 second"), 'leftValue) + val windowed2 = df2.select('key, window('rightTime, "10 second"), 'rightValue) + val joined = windowed1.join(windowed2, Seq("key", "window"), joinType) + .select('key, $"window.end".cast("long"), 'leftValue, 'rightValue) + + (input1, input2, joined) + } + + test("windowed left outer join") { + val (leftInput, rightInput, joined) = setupWindowedJoin("left_outer") + + testStream(joined)( + // Test inner part of the join. + AddData(leftInput, 1, 2, 3, 4, 5), + AddData(rightInput, 3, 4, 5, 6, 7), + CheckLastBatch((3, 10, 6, 9), (4, 10, 8, 12), (5, 10, 10, 15)), + // Old state doesn't get dropped until the batch *after* it gets introduced, so the + // nulls won't show up until the next batch after the watermark advances. + AddData(leftInput, 21), + AddData(rightInput, 22), + CheckLastBatch(), + assertNumStateRows(total = 12, updated = 2), + AddData(leftInput, 22), + CheckLastBatch(Row(22, 30, 44, 66), Row(1, 10, 2, null), Row(2, 10, 4, null)), + assertNumStateRows(total = 3, updated = 1) + ) + } + + test("windowed right outer join") { + val (leftInput, rightInput, joined) = setupWindowedJoin("right_outer") + + testStream(joined)( + // Test inner part of the join. + AddData(leftInput, 1, 2, 3, 4, 5), + AddData(rightInput, 3, 4, 5, 6, 7), + CheckLastBatch((3, 10, 6, 9), (4, 10, 8, 12), (5, 10, 10, 15)), + // Old state doesn't get dropped until the batch *after* it gets introduced, so the + // nulls won't show up until the next batch after the watermark advances. 
+ AddData(leftInput, 21), + AddData(rightInput, 22), + CheckLastBatch(), + assertNumStateRows(total = 12, updated = 2), + AddData(leftInput, 22), + CheckLastBatch(Row(22, 30, 44, 66), Row(6, 10, null, 18), Row(7, 10, null, 21)), + assertNumStateRows(total = 3, updated = 1) + ) + } + + Seq( + ("left_outer", Row(3, null, 5, null)), + ("right_outer", Row(null, 2, null, 5)) + ).foreach { case (joinType: String, outerResult) => + test(s"${joinType.replaceAllLiterally("_", " ")} with watermark range condition") { + import org.apache.spark.sql.functions._ + + val leftInput = MemoryStream[(Int, Int)] + val rightInput = MemoryStream[(Int, Int)] + + val df1 = leftInput.toDF.toDF("leftKey", "time") + .select('leftKey, 'time.cast("timestamp") as "leftTime", ('leftKey * 2) as "leftValue") + .withWatermark("leftTime", "10 seconds") + + val df2 = rightInput.toDF.toDF("rightKey", "time") + .select('rightKey, 'time.cast("timestamp") as "rightTime", ('rightKey * 3) as "rightValue") + .withWatermark("rightTime", "10 seconds") + + val joined = + df1.join( + df2, + expr("leftKey = rightKey AND " + + "leftTime BETWEEN rightTime - interval 5 seconds AND rightTime + interval 5 seconds"), + joinType) + .select('leftKey, 'rightKey, 'leftTime.cast("int"), 'rightTime.cast("int")) + testStream(joined)( + AddData(leftInput, (1, 5), (3, 5)), + CheckAnswer(), + AddData(rightInput, (1, 10), (2, 5)), + CheckLastBatch((1, 1, 5, 10)), + AddData(rightInput, (1, 11)), + CheckLastBatch(), // no match as left time is too low + assertNumStateRows(total = 5, updated = 1), + + // Increase event time watermark to 20s by adding data with time = 30s on both inputs + AddData(leftInput, (1, 7), (1, 30)), + CheckLastBatch((1, 1, 7, 10), (1, 1, 7, 11)), + assertNumStateRows(total = 7, updated = 2), + AddData(rightInput, (0, 30)), + CheckLastBatch(), + assertNumStateRows(total = 8, updated = 1), + AddData(rightInput, (0, 30)), + CheckLastBatch(outerResult), + assertNumStateRows(total = 3, updated = 1) + ) + } + } + + // When the join condition isn't true, the outer null rows must be generated, even if the join + // keys themselves have a match. 
+ test("left outer join with non-key condition violated on left") { + val (leftInput, simpleLeftDf) = setupStream("left", 2) + val (rightInput, simpleRightDf) = setupStream("right", 3) + + val left = simpleLeftDf.select('key, window('leftTime, "10 second"), 'leftValue) + val right = simpleRightDf.select('key, window('rightTime, "10 second"), 'rightValue) + + val joined = left.join( + right, + left("key") === right("key") && left("window") === right("window") && + 'leftValue > 10 && ('rightValue < 300 || 'rightValue > 1000), + "left_outer") + .select(left("key"), left("window.end").cast("long"), 'leftValue, 'rightValue) + + testStream(joined)( + // leftValue <= 10 should generate outer join rows even though it matches right keys + AddData(leftInput, 1, 2, 3), + AddData(rightInput, 1, 2, 3), + CheckLastBatch(), + AddData(leftInput, 20), + AddData(rightInput, 21), + CheckLastBatch(), + assertNumStateRows(total = 8, updated = 2), + AddData(rightInput, 20), + CheckLastBatch( + Row(20, 30, 40, 60), Row(1, 10, 2, null), Row(2, 10, 4, null), Row(3, 10, 6, null)), + assertNumStateRows(total = 3, updated = 1), + // leftValue and rightValue both satisfying condition should not generate outer join rows + AddData(leftInput, 40, 41), + AddData(rightInput, 40, 41), + CheckLastBatch((40, 50, 80, 120), (41, 50, 82, 123)), + AddData(leftInput, 70), + AddData(rightInput, 71), + CheckLastBatch(), + assertNumStateRows(total = 6, updated = 2), + AddData(rightInput, 70), + CheckLastBatch((70, 80, 140, 210)), + assertNumStateRows(total = 3, updated = 1), + // rightValue between 300 and 1000 should generate outer join rows even though it matches left + AddData(leftInput, 101, 102, 103), + AddData(rightInput, 101, 102, 103), + CheckLastBatch(), + AddData(leftInput, 1000), + AddData(rightInput, 1001), + CheckLastBatch(), + assertNumStateRows(total = 8, updated = 2), + AddData(rightInput, 1000), + CheckLastBatch( + Row(1000, 1010, 2000, 3000), + Row(101, 110, 202, null), + Row(102, 110, 204, null), + Row(103, 110, 206, null)), + assertNumStateRows(total = 3, updated = 1) + ) + } +} + From d54670192a6acd892d13b511dfb62390be6ad39c Mon Sep 17 00:00:00 2001 From: Rekha Joshi Date: Wed, 4 Oct 2017 07:11:00 +0100 Subject: [PATCH 662/779] [SPARK-22193][SQL] Minor typo fix ## What changes were proposed in this pull request? [SPARK-22193][SQL] Minor typo fix in SortMergeJoinExec. Nothing major, but it bothered me going into.Hence fixing ## How was this patch tested? existing tests Author: Rekha Joshi Author: rjoshi2 Closes #19422 from rekhajoshm/SPARK-22193. 
--- .../spark/sql/execution/joins/SortMergeJoinExec.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 14de2dc23e3c0..4e02803552e82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -402,7 +402,7 @@ case class SortMergeJoinExec( } } - private def genComparision(ctx: CodegenContext, a: Seq[ExprCode], b: Seq[ExprCode]): String = { + private def genComparison(ctx: CodegenContext, a: Seq[ExprCode], b: Seq[ExprCode]): String = { val comparisons = a.zip(b).zipWithIndex.map { case ((l, r), i) => s""" |if (comp == 0) { @@ -463,7 +463,7 @@ case class SortMergeJoinExec( | continue; | } | if (!$matches.isEmpty()) { - | ${genComparision(ctx, leftKeyVars, matchedKeyVars)} + | ${genComparison(ctx, leftKeyVars, matchedKeyVars)} | if (comp == 0) { | return true; | } @@ -484,7 +484,7 @@ case class SortMergeJoinExec( | } | ${rightKeyVars.map(_.code).mkString("\n")} | } - | ${genComparision(ctx, leftKeyVars, rightKeyVars)} + | ${genComparison(ctx, leftKeyVars, rightKeyVars)} | if (comp > 0) { | $rightRow = null; | } else if (comp < 0) { From 64df08b64779bab629a8a90a3797d8bd70f61703 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 4 Oct 2017 15:06:44 +0800 Subject: [PATCH 663/779] [SPARK-20783][SQL] Create ColumnVector to abstract existing compressed column (batch method) ## What changes were proposed in this pull request? This PR abstracts data compressed by `CompressibleColumnAccessor` using `ColumnVector` in a batch method. When `ColumnAccessor.decompress` is called, the `ColumnVector` will hold the uncompressed data. This batch decompression does not go through `InternalRow`, which reduces the number of memory accesses. As a first step of this implementation, this JIRA supports primitive data types. Another PR will support array and other data types. This implementation decompresses data in batch into an uncompressed column batch, as rxin suggested [here](https://github.com/apache/spark/pull/18468#issuecomment-316914076). An alternative implementation uses an adapter approach, [as cloud-fan suggested](https://github.com/apache/spark/pull/18468). ## How was this patch tested? Added test suites. Author: Kazuaki Ishizaki Closes #18704 from kiszk/SPARK-20783a.
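Before the diff itself, it may help to see the control flow the new decoders share: each `decompress` first reads the column buffer's nulls section (a null count followed by the null positions), then writes the non-null values directly into the writable vector while marking the null positions, instead of materializing one `InternalRow` per value. The sketch below is only an illustration of that loop under an assumed, simplified buffer layout of [null count][null positions][non-null values]; `IntVector` and `BatchDecompressSketch` are hypothetical stand-ins for this sketch, not classes from this patch.

    import java.nio.{ByteBuffer, ByteOrder}

    // Hypothetical minimal stand-in for WritableColumnVector, used only in this sketch.
    final class IntVector(capacity: Int) {
      private val values = new Array[Int](capacity)
      private val nulls = new Array[Boolean](capacity)
      def putInt(pos: Int, v: Int): Unit = values(pos) = v
      def putNull(pos: Int): Unit = nulls(pos) = true
      def isNullAt(pos: Int): Boolean = nulls(pos)
      def getInt(pos: Int): Int = values(pos)
    }

    object BatchDecompressSketch {
      // Assumed simplified layout: [null count][null positions][non-null int values].
      def decompress(buffer: ByteBuffer, out: IntVector, capacity: Int): Unit = {
        val buf = buffer.duplicate().order(ByteOrder.nativeOrder())
        val nullCount = buf.getInt()
        val nullPositions = Array.fill(nullCount)(buf.getInt())
        var nextNull = 0
        var pos = 0
        while (pos < capacity) {
          if (nextNull < nullCount && pos == nullPositions(nextNull)) {
            out.putNull(pos)              // null slot: consumes no bytes from the values section
            nextNull += 1
          } else {
            out.putInt(pos, buf.getInt()) // non-null slot: copy the next value in a single pass
          }
          pos += 1
        }
      }

      def main(args: Array[String]): Unit = {
        // Three slots where position 1 is null: null count = 1, positions = [1], values = [10, 30].
        val buf = ByteBuffer.allocate(16).order(ByteOrder.nativeOrder())
        buf.putInt(1).putInt(1).putInt(10).putInt(30)
        buf.flip()
        val out = new IntVector(3)
        decompress(buf, out, 3)
        assert(out.getInt(0) == 10 && out.isNullAt(1) && out.getInt(2) == 30)
      }
    }

The new `skeletonForDecompress` tests added below follow the same shape against the real decoders: build a compressed column buffer, call `decompress` into an `OnHeapColumnVector`, and check every position for the expected value or null.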
--- .../execution/columnar/ColumnDictionary.java | 58 +++ .../vectorized/OffHeapColumnVector.java | 18 + .../vectorized/OnHeapColumnVector.java | 18 + .../vectorized/WritableColumnVector.java | 76 ++-- .../execution/columnar/ColumnAccessor.scala | 16 +- .../sql/execution/columnar/ColumnType.scala | 33 ++ .../CompressibleColumnAccessor.scala | 4 + .../compression/CompressionScheme.scala | 3 + .../compression/compressionSchemes.scala | 340 +++++++++++++++++- .../compression/BooleanBitSetSuite.scala | 52 +++ .../compression/DictionaryEncodingSuite.scala | 72 +++- .../compression/IntegralDeltaSuite.scala | 72 ++++ .../PassThroughEncodingSuite.scala | 189 ++++++++++ .../compression/RunLengthEncodingSuite.scala | 89 ++++- .../TestCompressibleColumnBuilder.scala | 9 +- .../vectorized/ColumnVectorSuite.scala | 183 +++++++++- .../vectorized/ColumnarBatchSuite.scala | 4 +- 17 files changed, 1192 insertions(+), 44 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/columnar/ColumnDictionary.java create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/columnar/ColumnDictionary.java b/sql/core/src/main/java/org/apache/spark/sql/execution/columnar/ColumnDictionary.java new file mode 100644 index 0000000000000..f1785853a94ae --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/columnar/ColumnDictionary.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.columnar; + +import org.apache.spark.sql.execution.vectorized.Dictionary; + +public final class ColumnDictionary implements Dictionary { + private int[] intDictionary; + private long[] longDictionary; + + public ColumnDictionary(int[] dictionary) { + this.intDictionary = dictionary; + } + + public ColumnDictionary(long[] dictionary) { + this.longDictionary = dictionary; + } + + @Override + public int decodeToInt(int id) { + return intDictionary[id]; + } + + @Override + public long decodeToLong(int id) { + return longDictionary[id]; + } + + @Override + public float decodeToFloat(int id) { + throw new UnsupportedOperationException("Dictionary encoding does not support float"); + } + + @Override + public double decodeToDouble(int id) { + throw new UnsupportedOperationException("Dictionary encoding does not support double"); + } + + @Override + public byte[] decodeToBinary(int id) { + throw new UnsupportedOperationException("Dictionary encoding does not support String"); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 8cbc895506d91..a7522ebf5821a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -228,6 +228,12 @@ public void putShorts(int rowId, int count, short[] src, int srcIndex) { null, data + 2 * rowId, count * 2); } + @Override + public void putShorts(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, + null, data + rowId * 2, count * 2); + } + @Override public short getShort(int rowId) { if (dictionary == null) { @@ -268,6 +274,12 @@ public void putInts(int rowId, int count, int[] src, int srcIndex) { null, data + 4 * rowId, count * 4); } + @Override + public void putInts(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, + null, data + rowId * 4, count * 4); + } + @Override public void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { @@ -334,6 +346,12 @@ public void putLongs(int rowId, int count, long[] src, int srcIndex) { null, data + 8 * rowId, count * 8); } + @Override + public void putLongs(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, + null, data + rowId * 8, count * 8); + } + @Override public void putLongsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { if (!bigEndianPlatform) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 2725a29eeabe8..166a39e0fabd9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -233,6 +233,12 @@ public void putShorts(int rowId, int count, short[] src, int srcIndex) { System.arraycopy(src, srcIndex, shortData, rowId, count); } + @Override + public void putShorts(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, shortData, + Platform.SHORT_ARRAY_OFFSET + rowId * 2, count * 2); + } + 
@Override public short getShort(int rowId) { if (dictionary == null) { @@ -272,6 +278,12 @@ public void putInts(int rowId, int count, int[] src, int srcIndex) { System.arraycopy(src, srcIndex, intData, rowId, count); } + @Override + public void putInts(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, intData, + Platform.INT_ARRAY_OFFSET + rowId * 4, count * 4); + } + @Override public void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { int srcOffset = srcIndex + Platform.BYTE_ARRAY_OFFSET; @@ -332,6 +344,12 @@ public void putLongs(int rowId, int count, long[] src, int srcIndex) { System.arraycopy(src, srcIndex, longData, rowId, count); } + @Override + public void putLongs(int rowId, int count, byte[] src, int srcIndex) { + Platform.copyMemory(src, Platform.BYTE_ARRAY_OFFSET + srcIndex, longData, + Platform.LONG_ARRAY_OFFSET + rowId * 8, count * 8); + } + @Override public void putLongsLittleEndian(int rowId, int count, byte[] src, int srcIndex) { int srcOffset = srcIndex + Platform.BYTE_ARRAY_OFFSET; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 163f2511e5f73..da72954ddc448 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -113,138 +113,156 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { protected abstract void reserveInternal(int capacity); /** - * Sets the value at rowId to null/not null. + * Sets null/not null to the value at rowId. */ public abstract void putNotNull(int rowId); public abstract void putNull(int rowId); /** - * Sets the values from [rowId, rowId + count) to null/not null. + * Sets null/not null to the values at [rowId, rowId + count). */ public abstract void putNulls(int rowId, int count); public abstract void putNotNulls(int rowId, int count); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putBoolean(int rowId, boolean value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). */ public abstract void putBooleans(int rowId, int count, boolean value); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putByte(int rowId, byte value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). */ public abstract void putBytes(int rowId, int count, byte value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putBytes(int rowId, int count, byte[] src, int srcIndex); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putShort(int rowId, short value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). 
*/ public abstract void putShorts(int rowId, int count, short value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putShorts(int rowId, int count, short[] src, int srcIndex); /** - * Sets the value at rowId to `value`. + * Sets values from [src[srcIndex], src[srcIndex + count * 2]) to [rowId, rowId + count) + * The data in src must be 2-byte platform native endian shorts. + */ + public abstract void putShorts(int rowId, int count, byte[] src, int srcIndex); + + /** + * Sets `value` to the value at rowId. */ public abstract void putInt(int rowId, int value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). */ public abstract void putInts(int rowId, int count, int value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putInts(int rowId, int count, int[] src, int srcIndex); /** - * Sets values from [rowId, rowId + count) to [src[srcIndex], src[srcIndex + count]) + * Sets values from [src[srcIndex], src[srcIndex + count * 4]) to [rowId, rowId + count) + * The data in src must be 4-byte platform native endian ints. + */ + public abstract void putInts(int rowId, int count, byte[] src, int srcIndex); + + /** + * Sets values from [src[srcIndex], src[srcIndex + count * 4]) to [rowId, rowId + count) * The data in src must be 4-byte little endian ints. */ public abstract void putIntsLittleEndian(int rowId, int count, byte[] src, int srcIndex); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putLong(int rowId, long value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). */ public abstract void putLongs(int rowId, int count, long value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putLongs(int rowId, int count, long[] src, int srcIndex); /** - * Sets values from [rowId, rowId + count) to [src[srcIndex], src[srcIndex + count]) + * Sets values from [src[srcIndex], src[srcIndex + count * 8]) to [rowId, rowId + count) + * The data in src must be 8-byte platform native endian longs. + */ + public abstract void putLongs(int rowId, int count, byte[] src, int srcIndex); + + /** + * Sets values from [src + srcIndex, src + srcIndex + count * 8) to [rowId, rowId + count) * The data in src must be 8-byte little endian longs. */ public abstract void putLongsLittleEndian(int rowId, int count, byte[] src, int srcIndex); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putFloat(int rowId, float value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). 
*/ public abstract void putFloats(int rowId, int count, float value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putFloats(int rowId, int count, float[] src, int srcIndex); /** - * Sets values from [rowId, rowId + count) to [src[srcIndex], src[srcIndex + count]) - * The data in src must be ieee formatted floats. + * Sets values from [src[srcIndex], src[srcIndex + count * 4]) to [rowId, rowId + count) + * The data in src must be ieee formatted floats in platform native endian. */ public abstract void putFloats(int rowId, int count, byte[] src, int srcIndex); /** - * Sets the value at rowId to `value`. + * Sets `value` to the value at rowId. */ public abstract void putDouble(int rowId, double value); /** - * Sets values from [rowId, rowId + count) to value. + * Sets value to [rowId, rowId + count). */ public abstract void putDoubles(int rowId, int count, double value); /** - * Sets values from [rowId, rowId + count) to [src + srcIndex, src + srcIndex + count) + * Sets values from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) */ public abstract void putDoubles(int rowId, int count, double[] src, int srcIndex); /** - * Sets values from [rowId, rowId + count) to [src[srcIndex], src[srcIndex + count]) - * The data in src must be ieee formatted doubles. + * Sets values from [src[srcIndex], src[srcIndex + count * 8]) to [rowId, rowId + count) + * The data in src must be ieee formatted doubles in platform native endian. */ public abstract void putDoubles(int rowId, int count, byte[] src, int srcIndex); @@ -254,7 +272,7 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { public abstract void putArray(int rowId, int offset, int length); /** - * Sets the value at rowId to `value`. + * Sets values from [value + offset, value + offset + count) to the values at rowId. 
*/ public abstract int putByteArray(int rowId, byte[] value, int offset, int count); public final int putByteArray(int rowId, byte[] value) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala index 6241b79d9affc..24c8ac81420cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala @@ -24,6 +24,7 @@ import scala.annotation.tailrec import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeMapData, UnsafeRow} import org.apache.spark.sql.execution.columnar.compression.CompressibleColumnAccessor +import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types._ /** @@ -62,6 +63,9 @@ private[columnar] abstract class BasicColumnAccessor[JvmType]( } protected def underlyingBuffer = buffer + + def getByteBuffer: ByteBuffer = + buffer.duplicate.order(ByteOrder.nativeOrder()) } private[columnar] class NullColumnAccessor(buffer: ByteBuffer) @@ -122,7 +126,7 @@ private[columnar] class MapColumnAccessor(buffer: ByteBuffer, dataType: MapType) extends BasicColumnAccessor[UnsafeMapData](buffer, MAP(dataType)) with NullableColumnAccessor -private[columnar] object ColumnAccessor { +private[sql] object ColumnAccessor { @tailrec def apply(dataType: DataType, buffer: ByteBuffer): ColumnAccessor = { val buf = buffer.order(ByteOrder.nativeOrder) @@ -149,4 +153,14 @@ private[columnar] object ColumnAccessor { throw new Exception(s"not support type: $other") } } + + def decompress(columnAccessor: ColumnAccessor, columnVector: WritableColumnVector, numRows: Int): + Unit = { + if (columnAccessor.isInstanceOf[NativeColumnAccessor[_]]) { + val nativeAccessor = columnAccessor.asInstanceOf[NativeColumnAccessor[_]] + nativeAccessor.decompress(columnVector, numRows) + } else { + throw new RuntimeException("Not support non-primitive type now") + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala index 5cfb003e4f150..e9b150fd86095 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala @@ -43,6 +43,12 @@ import org.apache.spark.unsafe.types.UTF8String * WARNING: This only works with HeapByteBuffer */ private[columnar] object ByteBufferHelper { + def getShort(buffer: ByteBuffer): Short = { + val pos = buffer.position() + buffer.position(pos + 2) + Platform.getShort(buffer.array(), Platform.BYTE_ARRAY_OFFSET + pos) + } + def getInt(buffer: ByteBuffer): Int = { val pos = buffer.position() buffer.position(pos + 4) @@ -66,6 +72,33 @@ private[columnar] object ByteBufferHelper { buffer.position(pos + 8) Platform.getDouble(buffer.array(), Platform.BYTE_ARRAY_OFFSET + pos) } + + def putShort(buffer: ByteBuffer, value: Short): Unit = { + val pos = buffer.position() + buffer.position(pos + 2) + Platform.putShort(buffer.array(), Platform.BYTE_ARRAY_OFFSET + pos, value) + } + + def putInt(buffer: ByteBuffer, value: Int): Unit = { + val pos = buffer.position() + buffer.position(pos + 4) + Platform.putInt(buffer.array(), Platform.BYTE_ARRAY_OFFSET + pos, value) + } + + def putLong(buffer: ByteBuffer, value: Long): Unit = 
{ + val pos = buffer.position() + buffer.position(pos + 8) + Platform.putLong(buffer.array(), Platform.BYTE_ARRAY_OFFSET + pos, value) + } + + def copyMemory(src: ByteBuffer, dst: ByteBuffer, len: Int): Unit = { + val srcPos = src.position() + val dstPos = dst.position() + src.position(srcPos + len) + dst.position(dstPos + len) + Platform.copyMemory(src.array(), Platform.BYTE_ARRAY_OFFSET + srcPos, + dst.array(), Platform.BYTE_ARRAY_OFFSET + dstPos, len) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressibleColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressibleColumnAccessor.scala index e1d13ad0e94e5..774011f1e3de8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressibleColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressibleColumnAccessor.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor} +import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor { @@ -36,4 +37,7 @@ private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends Colu override def extractSingle(row: InternalRow, ordinal: Int): Unit = { decoder.next(row, ordinal) } + + def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = + decoder.decompress(columnVector, capacity) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressionScheme.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressionScheme.scala index 6e4f1c5b80684..f8aeba44257d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressionScheme.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/CompressionScheme.scala @@ -21,6 +21,7 @@ import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnType, NativeColumnType} +import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait Encoder[T <: AtomicType] { @@ -41,6 +42,8 @@ private[columnar] trait Decoder[T <: AtomicType] { def next(row: InternalRow, ordinal: Int): Unit def hasNext: Boolean + + def decompress(columnVector: WritableColumnVector, capacity: Int): Unit } private[columnar] trait CompressionScheme { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala index ee99c90a751d9..bf00ad997c76e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql.execution.columnar.compression import java.nio.ByteBuffer +import java.nio.ByteOrder import scala.collection.mutable import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow 
import org.apache.spark.sql.execution.columnar._ +import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types._ @@ -61,6 +63,101 @@ private[columnar] case object PassThrough extends CompressionScheme { } override def hasNext: Boolean = buffer.hasRemaining + + private def putBooleans( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + for (i <- 0 until len) { + columnVector.putBoolean(pos + i, (buffer.get(bufferPos + i) != 0)) + } + } + + private def putBytes( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putBytes(pos, len, buffer.array, bufferPos) + } + + private def putShorts( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putShorts(pos, len, buffer.array, bufferPos) + } + + private def putInts( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putInts(pos, len, buffer.array, bufferPos) + } + + private def putLongs( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putLongs(pos, len, buffer.array, bufferPos) + } + + private def putFloats( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putFloats(pos, len, buffer.array, bufferPos) + } + + private def putDoubles( + columnVector: WritableColumnVector, pos: Int, bufferPos: Int, len: Int): Unit = { + columnVector.putDoubles(pos, len, buffer.array, bufferPos) + } + + private def decompress0( + columnVector: WritableColumnVector, + capacity: Int, + unitSize: Int, + putFunction: (WritableColumnVector, Int, Int, Int) => Unit): Unit = { + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind() + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else capacity + var pos = 0 + var seenNulls = 0 + var bufferPos = buffer.position + while (pos < capacity) { + if (pos != nextNullIndex) { + val len = nextNullIndex - pos + assert(len * unitSize < Int.MaxValue) + putFunction(columnVector, pos, bufferPos, len) + bufferPos += len * unitSize + pos += len + } else { + seenNulls += 1 + nextNullIndex = if (seenNulls < nullCount) { + ByteBufferHelper.getInt(nullsBuffer) + } else { + capacity + } + columnVector.putNull(pos) + pos += 1 + } + } + } + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + columnType.dataType match { + case _: BooleanType => + val unitSize = 1 + decompress0(columnVector, capacity, unitSize, putBooleans) + case _: ByteType => + val unitSize = 1 + decompress0(columnVector, capacity, unitSize, putBytes) + case _: ShortType => + val unitSize = 2 + decompress0(columnVector, capacity, unitSize, putShorts) + case _: IntegerType => + val unitSize = 4 + decompress0(columnVector, capacity, unitSize, putInts) + case _: LongType => + val unitSize = 8 + decompress0(columnVector, capacity, unitSize, putLongs) + case _: FloatType => + val unitSize = 4 + decompress0(columnVector, capacity, unitSize, putFloats) + case _: DoubleType => + val unitSize = 8 + decompress0(columnVector, capacity, unitSize, putDoubles) + } + } } } @@ -169,6 +266,94 @@ private[columnar] case object RunLengthEncoding extends CompressionScheme { } override def hasNext: Boolean = valueCount < run || buffer.hasRemaining + + private def putBoolean(columnVector: WritableColumnVector, pos: Int, value: Long): 
Unit = { + columnVector.putBoolean(pos, value == 1) + } + + private def getByte(buffer: ByteBuffer): Long = { + buffer.get().toLong + } + + private def putByte(columnVector: WritableColumnVector, pos: Int, value: Long): Unit = { + columnVector.putByte(pos, value.toByte) + } + + private def getShort(buffer: ByteBuffer): Long = { + buffer.getShort().toLong + } + + private def putShort(columnVector: WritableColumnVector, pos: Int, value: Long): Unit = { + columnVector.putShort(pos, value.toShort) + } + + private def getInt(buffer: ByteBuffer): Long = { + buffer.getInt().toLong + } + + private def putInt(columnVector: WritableColumnVector, pos: Int, value: Long): Unit = { + columnVector.putInt(pos, value.toInt) + } + + private def getLong(buffer: ByteBuffer): Long = { + buffer.getLong() + } + + private def putLong(columnVector: WritableColumnVector, pos: Int, value: Long): Unit = { + columnVector.putLong(pos, value) + } + + private def decompress0( + columnVector: WritableColumnVector, + capacity: Int, + getFunction: (ByteBuffer) => Long, + putFunction: (WritableColumnVector, Int, Long) => Unit): Unit = { + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind() + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 + var pos = 0 + var seenNulls = 0 + var runLocal = 0 + var valueCountLocal = 0 + var currentValueLocal: Long = 0 + + while (valueCountLocal < runLocal || (pos < capacity)) { + if (pos != nextNullIndex) { + if (valueCountLocal == runLocal) { + currentValueLocal = getFunction(buffer) + runLocal = ByteBufferHelper.getInt(buffer) + valueCountLocal = 1 + } else { + valueCountLocal += 1 + } + putFunction(columnVector, pos, currentValueLocal) + } else { + seenNulls += 1 + if (seenNulls < nullCount) { + nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + } + columnVector.putNull(pos) + } + pos += 1 + } + } + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + columnType.dataType match { + case _: BooleanType => + decompress0(columnVector, capacity, getByte, putBoolean) + case _: ByteType => + decompress0(columnVector, capacity, getByte, putByte) + case _: ShortType => + decompress0(columnVector, capacity, getShort, putShort) + case _: IntegerType => + decompress0(columnVector, capacity, getInt, putInt) + case _: LongType => + decompress0(columnVector, capacity, getLong, putLong) + case _ => throw new IllegalStateException("Not supported type in RunLengthEncoding.") + } + } } } @@ -266,11 +451,32 @@ private[columnar] case object DictionaryEncoding extends CompressionScheme { } class Decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]) - extends compression.Decoder[T] { - - private val dictionary: Array[Any] = { - val elementNum = ByteBufferHelper.getInt(buffer) - Array.fill[Any](elementNum)(columnType.extract(buffer).asInstanceOf[Any]) + extends compression.Decoder[T] { + val elementNum = ByteBufferHelper.getInt(buffer) + private val dictionary: Array[Any] = new Array[Any](elementNum) + private var intDictionary: Array[Int] = null + private var longDictionary: Array[Long] = null + + columnType.dataType match { + case _: IntegerType => + intDictionary = new Array[Int](elementNum) + for (i <- 0 until elementNum) { + val v = columnType.extract(buffer).asInstanceOf[Int] + intDictionary(i) = v + dictionary(i) = v + } + case _: LongType => + longDictionary = new Array[Long](elementNum) + for (i <- 0 until 
elementNum) { + val v = columnType.extract(buffer).asInstanceOf[Long] + longDictionary(i) = v + dictionary(i) = v + } + case _: StringType => + for (i <- 0 until elementNum) { + val v = columnType.extract(buffer).asInstanceOf[Any] + dictionary(i) = v + } } override def next(row: InternalRow, ordinal: Int): Unit = { @@ -278,6 +484,46 @@ private[columnar] case object DictionaryEncoding extends CompressionScheme { } override def hasNext: Boolean = buffer.hasRemaining + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind() + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 + var pos = 0 + var seenNulls = 0 + columnType.dataType match { + case _: IntegerType => + val dictionaryIds = columnVector.reserveDictionaryIds(capacity) + columnVector.setDictionary(new ColumnDictionary(intDictionary)) + while (pos < capacity) { + if (pos != nextNullIndex) { + dictionaryIds.putInt(pos, buffer.getShort()) + } else { + seenNulls += 1 + if (seenNulls < nullCount) nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + columnVector.putNull(pos) + } + pos += 1 + } + case _: LongType => + val dictionaryIds = columnVector.reserveDictionaryIds(capacity) + columnVector.setDictionary(new ColumnDictionary(longDictionary)) + while (pos < capacity) { + if (pos != nextNullIndex) { + dictionaryIds.putInt(pos, buffer.getShort()) + } else { + seenNulls += 1 + if (seenNulls < nullCount) { + nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + } + columnVector.putNull(pos) + } + pos += 1 + } + case _ => throw new IllegalStateException("Not supported type in DictionaryEncoding.") + } + } } } @@ -368,6 +614,38 @@ private[columnar] case object BooleanBitSet extends CompressionScheme { } override def hasNext: Boolean = visited < count + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + val countLocal = count + var currentWordLocal: Long = 0 + var visitedLocal: Int = 0 + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind() + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 + var pos = 0 + var seenNulls = 0 + + while (visitedLocal < countLocal) { + if (pos != nextNullIndex) { + val bit = visitedLocal % BITS_PER_LONG + + visitedLocal += 1 + if (bit == 0) { + currentWordLocal = ByteBufferHelper.getLong(buffer) + } + + columnVector.putBoolean(pos, ((currentWordLocal >> bit) & 1) != 0) + } else { + seenNulls += 1 + if (seenNulls < nullCount) { + nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + } + columnVector.putNull(pos) + } + pos += 1 + } + } } } @@ -448,6 +726,32 @@ private[columnar] case object IntDelta extends CompressionScheme { prev = if (delta > Byte.MinValue) prev + delta else ByteBufferHelper.getInt(buffer) row.setInt(ordinal, prev) } + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + var prevLocal: Int = 0 + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind() + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 + var pos = 0 + var seenNulls = 0 + + while (pos < capacity) { + if (pos != nextNullIndex) { + val delta = buffer.get + prevLocal = if (delta > Byte.MinValue) { 
prevLocal + delta } else + { ByteBufferHelper.getInt(buffer) } + columnVector.putInt(pos, prevLocal) + } else { + seenNulls += 1 + if (seenNulls < nullCount) { + nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + } + columnVector.putNull(pos) + } + pos += 1 + } + } } } @@ -528,5 +832,31 @@ private[columnar] case object LongDelta extends CompressionScheme { prev = if (delta > Byte.MinValue) prev + delta else ByteBufferHelper.getLong(buffer) row.setLong(ordinal, prev) } + + override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { + var prevLocal: Long = 0 + val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) + nullsBuffer.rewind + val nullCount = ByteBufferHelper.getInt(nullsBuffer) + var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 + var pos = 0 + var seenNulls = 0 + + while (pos < capacity) { + if (pos != nextNullIndex) { + val delta = buffer.get() + prevLocal = if (delta > Byte.MinValue) { prevLocal + delta } else + { ByteBufferHelper.getLong(buffer) } + columnVector.putLong(pos, prevLocal) + } else { + seenNulls += 1 + if (seenNulls < nullCount) { + nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) + } + columnVector.putNull(pos) + } + pos += 1 + } + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala index d01bf911e3a77..2d71a42628dfb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala @@ -22,6 +22,8 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.execution.columnar.{BOOLEAN, NoopColumnStats} import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import org.apache.spark.sql.types.BooleanType class BooleanBitSetSuite extends SparkFunSuite { import BooleanBitSet._ @@ -85,6 +87,36 @@ class BooleanBitSetSuite extends SparkFunSuite { assert(!decoder.hasNext) } + def skeletonForDecompress(count: Int) { + val builder = TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet) + val rows = Seq.fill[InternalRow](count)(makeRandomRow(BOOLEAN)) + val values = rows.map(_.getBoolean(0)) + + rows.foreach(builder.appendFrom(_, 0)) + val buffer = builder.build() + + // ---------------- + // Tests decompress + // ---------------- + + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(BooleanBitSet.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = BooleanBitSet.decoder(buffer, BOOLEAN) + val columnVector = new OnHeapColumnVector(values.length, BooleanType) + decoder.decompress(columnVector, values.length) + + if (values.nonEmpty) { + values.zipWithIndex.foreach { case (b: Boolean, index: Int) => + assertResult(b, s"Wrong ${index}-th decoded boolean value") { + columnVector.getBoolean(index) + } + } + } + } + test(s"$BooleanBitSet: empty") { skeleton(0) } @@ -104,4 +136,24 @@ class BooleanBitSetSuite extends SparkFunSuite { test(s"$BooleanBitSet: multiple words and 1 more bit") { skeleton(BITS_PER_LONG * 2 + 1) } + + test(s"$BooleanBitSet: 
empty for decompression()") { + skeletonForDecompress(0) + } + + test(s"$BooleanBitSet: less than 1 word for decompression()") { + skeletonForDecompress(BITS_PER_LONG - 1) + } + + test(s"$BooleanBitSet: exactly 1 word for decompression()") { + skeletonForDecompress(BITS_PER_LONG) + } + + test(s"$BooleanBitSet: multiple whole words for decompression()") { + skeletonForDecompress(BITS_PER_LONG * 2) + } + + test(s"$BooleanBitSet: multiple words and 1 more bit for decompression()") { + skeletonForDecompress(BITS_PER_LONG * 2 + 1) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala index 67139b13d7882..28950b74cf1c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala @@ -23,16 +23,19 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.types.AtomicType class DictionaryEncodingSuite extends SparkFunSuite { + val nullValue = -1 testDictionaryEncoding(new IntColumnStats, INT) testDictionaryEncoding(new LongColumnStats, LONG) - testDictionaryEncoding(new StringColumnStats, STRING) + testDictionaryEncoding(new StringColumnStats, STRING, false) def testDictionaryEncoding[T <: AtomicType]( columnStats: ColumnStats, - columnType: NativeColumnType[T]) { + columnType: NativeColumnType[T], + testDecompress: Boolean = true) { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") @@ -113,6 +116,58 @@ class DictionaryEncodingSuite extends SparkFunSuite { } } + def skeletonForDecompress(uniqueValueCount: Int, inputSeq: Seq[Int]) { + if (!testDecompress) return + val builder = TestCompressibleColumnBuilder(columnStats, columnType, DictionaryEncoding) + val (values, rows) = makeUniqueValuesAndSingleValueRows(columnType, uniqueValueCount) + val dictValues = stableDistinct(inputSeq) + + val nullRow = new GenericInternalRow(1) + nullRow.setNullAt(0) + inputSeq.foreach { i => + if (i == nullValue) { + builder.appendFrom(nullRow, 0) + } else { + builder.appendFrom(rows(i), 0) + } + } + val buffer = builder.build() + + // ---------------- + // Tests decompress + // ---------------- + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(DictionaryEncoding.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = DictionaryEncoding.decoder(buffer, columnType) + val columnVector = new OnHeapColumnVector(inputSeq.length, columnType.dataType) + decoder.decompress(columnVector, inputSeq.length) + + if (inputSeq.nonEmpty) { + inputSeq.zipWithIndex.foreach { case (i: Any, index: Int) => + if (i == nullValue) { + assertResult(true, s"Wrong null ${index}-th position") { + columnVector.isNullAt(index) + } + } else { + columnType match { + case INT => + assertResult(values(i), s"Wrong ${index}-th decoded int value") { + columnVector.getInt(index) + } + case LONG => + assertResult(values(i), s"Wrong ${index}-th decoded long value") { + 
columnVector.getLong(index) + } + case _ => fail("Unsupported type") + } + } + } + } + } + test(s"$DictionaryEncoding with $typeName: empty") { skeleton(0, Seq.empty) } @@ -124,5 +179,18 @@ class DictionaryEncodingSuite extends SparkFunSuite { test(s"$DictionaryEncoding with $typeName: dictionary overflow") { skeleton(DictionaryEncoding.MAX_DICT_SIZE + 1, 0 to DictionaryEncoding.MAX_DICT_SIZE) } + + test(s"$DictionaryEncoding with $typeName: empty for decompress()") { + skeletonForDecompress(0, Seq.empty) + } + + test(s"$DictionaryEncoding with $typeName: simple case for decompress()") { + skeletonForDecompress(2, Seq(0, nullValue, 0, nullValue)) + } + + test(s"$DictionaryEncoding with $typeName: dictionary overflow for decompress()") { + skeletonForDecompress(DictionaryEncoding.MAX_DICT_SIZE + 2, + Seq(nullValue) ++ (0 to DictionaryEncoding.MAX_DICT_SIZE - 1) ++ Seq(nullValue)) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala index 411d31fa0e29b..0d9f1fb0c02c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala @@ -21,9 +21,11 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.types.IntegralType class IntegralDeltaSuite extends SparkFunSuite { + val nullValue = -1 testIntegralDelta(new IntColumnStats, INT, IntDelta) testIntegralDelta(new LongColumnStats, LONG, LongDelta) @@ -109,6 +111,53 @@ class IntegralDeltaSuite extends SparkFunSuite { assert(!decoder.hasNext) } + def skeletonForDecompress(input: Seq[I#InternalType]) { + val builder = TestCompressibleColumnBuilder(columnStats, columnType, scheme) + val row = new GenericInternalRow(1) + val nullRow = new GenericInternalRow(1) + nullRow.setNullAt(0) + input.map { value => + if (value == nullValue) { + builder.appendFrom(nullRow, 0) + } else { + columnType.setField(row, 0, value) + builder.appendFrom(row, 0) + } + } + val buffer = builder.build() + + // ---------------- + // Tests decompress + // ---------------- + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(scheme.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = scheme.decoder(buffer, columnType) + val columnVector = new OnHeapColumnVector(input.length, columnType.dataType) + decoder.decompress(columnVector, input.length) + + if (input.nonEmpty) { + input.zipWithIndex.foreach { + case (expected: Any, index: Int) if expected == nullValue => + assertResult(true, s"Wrong null ${index}th-position") { + columnVector.isNullAt(index) + } + case (expected: Int, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded int value") { + columnVector.getInt(index) + } + case (expected: Long, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded long value") { + columnVector.getLong(index) + } + case _ => + fail("Unsupported type") + } + } + } + test(s"$scheme: empty column") { skeleton(Seq.empty) } @@ -127,5 
+176,28 @@ class IntegralDeltaSuite extends SparkFunSuite { val input = Array.fill[Any](10000)(makeRandomValue(columnType)) skeleton(input.map(_.asInstanceOf[I#InternalType])) } + + + test(s"$scheme: empty column for decompress()") { + skeletonForDecompress(Seq.empty) + } + + test(s"$scheme: simple case for decompress()") { + val input = columnType match { + case INT => Seq(2: Int, 1: Int, 2: Int, 130: Int) + case LONG => Seq(2: Long, 1: Long, 2: Long, 130: Long) + } + + skeletonForDecompress(input.map(_.asInstanceOf[I#InternalType])) + } + + test(s"$scheme: simple case with null for decompress()") { + val input = columnType match { + case INT => Seq(2: Int, 1: Int, 2: Int, nullValue: Int, 5: Int) + case LONG => Seq(2: Long, 1: Long, 2: Long, nullValue: Long, 5: Long) + } + + skeletonForDecompress(input.map(_.asInstanceOf[I#InternalType])) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala new file mode 100644 index 0000000000000..b6f0b5e6277b4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.columnar.compression + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.execution.columnar._ +import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import org.apache.spark.sql.types.AtomicType + +class PassThroughSuite extends SparkFunSuite { + val nullValue = -1 + testPassThrough(new ByteColumnStats, BYTE) + testPassThrough(new ShortColumnStats, SHORT) + testPassThrough(new IntColumnStats, INT) + testPassThrough(new LongColumnStats, LONG) + testPassThrough(new FloatColumnStats, FLOAT) + testPassThrough(new DoubleColumnStats, DOUBLE) + + def testPassThrough[T <: AtomicType]( + columnStats: ColumnStats, + columnType: NativeColumnType[T]) { + + val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + + def skeleton(input: Seq[T#InternalType]) { + // ------------- + // Tests encoder + // ------------- + + val builder = TestCompressibleColumnBuilder(columnStats, columnType, PassThrough) + + input.map { value => + val row = new GenericInternalRow(1) + columnType.setField(row, 0, value) + builder.appendFrom(row, 0) + } + + val buffer = builder.build() + // Column type ID + null count + null positions + val headerSize = CompressionScheme.columnHeaderSize(buffer) + + // Compression scheme ID + compressed contents + val compressedSize = 4 + input.size * columnType.defaultSize + + // 4 extra bytes for compression scheme type ID + assertResult(headerSize + compressedSize, "Wrong buffer capacity")(buffer.capacity) + + buffer.position(headerSize) + assertResult(PassThrough.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + if (input.nonEmpty) { + input.foreach { value => + assertResult(value, "Wrong value")(columnType.extract(buffer)) + } + } + + // ------------- + // Tests decoder + // ------------- + + // Rewinds, skips column header and 4 more bytes for compression scheme ID + buffer.rewind().position(headerSize + 4) + + val decoder = PassThrough.decoder(buffer, columnType) + val mutableRow = new GenericInternalRow(1) + + if (input.nonEmpty) { + input.foreach{ + assert(decoder.hasNext) + assertResult(_, "Wrong decoded value") { + decoder.next(mutableRow, 0) + columnType.getField(mutableRow, 0) + } + } + } + assert(!decoder.hasNext) + } + + def skeletonForDecompress(input: Seq[T#InternalType]) { + val builder = TestCompressibleColumnBuilder(columnStats, columnType, PassThrough) + val row = new GenericInternalRow(1) + val nullRow = new GenericInternalRow(1) + nullRow.setNullAt(0) + input.map { value => + if (value == nullValue) { + builder.appendFrom(nullRow, 0) + } else { + columnType.setField(row, 0, value) + builder.appendFrom(row, 0) + } + } + val buffer = builder.build() + + // ---------------- + // Tests decompress + // ---------------- + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(PassThrough.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = PassThrough.decoder(buffer, columnType) + val columnVector = new OnHeapColumnVector(input.length, columnType.dataType) + decoder.decompress(columnVector, input.length) + + if (input.nonEmpty) { + input.zipWithIndex.foreach { + case (expected: Any, index: Int) if expected == nullValue => + assertResult(true, s"Wrong null ${index}th-position") { + 
columnVector.isNullAt(index) + } + case (expected: Byte, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded byte value") { + columnVector.getByte(index) + } + case (expected: Short, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded short value") { + columnVector.getShort(index) + } + case (expected: Int, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded int value") { + columnVector.getInt(index) + } + case (expected: Long, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded long value") { + columnVector.getLong(index) + } + case (expected: Float, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded float value") { + columnVector.getFloat(index) + } + case (expected: Double, index: Int) => + assertResult(expected, s"Wrong ${index}-th decoded double value") { + columnVector.getDouble(index) + } + case _ => fail("Unsupported type") + } + } + } + + test(s"$PassThrough with $typeName: empty column") { + skeleton(Seq.empty) + } + + test(s"$PassThrough with $typeName: long random series") { + val input = Array.fill[Any](10000)(makeRandomValue(columnType)) + skeleton(input.map(_.asInstanceOf[T#InternalType])) + } + + test(s"$PassThrough with $typeName: empty column for decompress()") { + skeletonForDecompress(Seq.empty) + } + + test(s"$PassThrough with $typeName: long random series for decompress()") { + val input = Array.fill[Any](10000)(makeRandomValue(columnType)) + skeletonForDecompress(input.map(_.asInstanceOf[T#InternalType])) + } + + test(s"$PassThrough with $typeName: simple case with null for decompress()") { + val input = columnType match { + case BYTE => Seq(2: Byte, 1: Byte, 2: Byte, nullValue.toByte: Byte, 5: Byte) + case SHORT => Seq(2: Short, 1: Short, 2: Short, nullValue.toShort: Short, 5: Short) + case INT => Seq(2: Int, 1: Int, 2: Int, nullValue: Int, 5: Int) + case LONG => Seq(2: Long, 1: Long, 2: Long, nullValue: Long, 5: Long) + case FLOAT => Seq(2: Float, 1: Float, 2: Float, nullValue: Float, 5: Float) + case DOUBLE => Seq(2: Double, 1: Double, 2: Double, nullValue: Double, 5: Double) + } + + skeletonForDecompress(input.map(_.asInstanceOf[T#InternalType])) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala index dffa9b364ebfe..eb1cdd9bbceff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala @@ -21,19 +21,22 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.types.AtomicType class RunLengthEncodingSuite extends SparkFunSuite { + val nullValue = -1 testRunLengthEncoding(new NoopColumnStats, BOOLEAN) testRunLengthEncoding(new ByteColumnStats, BYTE) testRunLengthEncoding(new ShortColumnStats, SHORT) testRunLengthEncoding(new IntColumnStats, INT) testRunLengthEncoding(new LongColumnStats, LONG) - testRunLengthEncoding(new StringColumnStats, STRING) + testRunLengthEncoding(new StringColumnStats, STRING, false) def testRunLengthEncoding[T <: AtomicType]( columnStats: 
ColumnStats, - columnType: NativeColumnType[T]) { + columnType: NativeColumnType[T], + testDecompress: Boolean = true) { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") @@ -95,6 +98,72 @@ class RunLengthEncodingSuite extends SparkFunSuite { assert(!decoder.hasNext) } + def skeletonForDecompress(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]) { + if (!testDecompress) return + val builder = TestCompressibleColumnBuilder(columnStats, columnType, RunLengthEncoding) + val (values, rows) = makeUniqueValuesAndSingleValueRows(columnType, uniqueValueCount) + val inputSeq = inputRuns.flatMap { case (index, run) => + Seq.fill(run)(index) + } + + val nullRow = new GenericInternalRow(1) + nullRow.setNullAt(0) + inputSeq.foreach { i => + if (i == nullValue) { + builder.appendFrom(nullRow, 0) + } else { + builder.appendFrom(rows(i), 0) + } + } + val buffer = builder.build() + + // ---------------- + // Tests decompress + // ---------------- + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(RunLengthEncoding.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = RunLengthEncoding.decoder(buffer, columnType) + val columnVector = new OnHeapColumnVector(inputSeq.length, columnType.dataType) + decoder.decompress(columnVector, inputSeq.length) + + if (inputSeq.nonEmpty) { + inputSeq.zipWithIndex.foreach { + case (expected: Any, index: Int) if expected == nullValue => + assertResult(true, s"Wrong null ${index}th-position") { + columnVector.isNullAt(index) + } + case (i: Int, index: Int) => + columnType match { + case BOOLEAN => + assertResult(values(i), s"Wrong ${index}-th decoded boolean value") { + columnVector.getBoolean(index) + } + case BYTE => + assertResult(values(i), s"Wrong ${index}-th decoded byte value") { + columnVector.getByte(index) + } + case SHORT => + assertResult(values(i), s"Wrong ${index}-th decoded short value") { + columnVector.getShort(index) + } + case INT => + assertResult(values(i), s"Wrong ${index}-th decoded int value") { + columnVector.getInt(index) + } + case LONG => + assertResult(values(i), s"Wrong ${index}-th decoded long value") { + columnVector.getLong(index) + } + case _ => fail("Unsupported type") + } + case _ => fail("Unsupported type") + } + } + } + test(s"$RunLengthEncoding with $typeName: empty column") { skeleton(0, Seq.empty) } @@ -110,5 +179,21 @@ class RunLengthEncodingSuite extends SparkFunSuite { test(s"$RunLengthEncoding with $typeName: single long run") { skeleton(1, Seq(0 -> 1000)) } + + test(s"$RunLengthEncoding with $typeName: empty column for decompress()") { + skeletonForDecompress(0, Seq.empty) + } + + test(s"$RunLengthEncoding with $typeName: simple case for decompress()") { + skeletonForDecompress(2, Seq(0 -> 2, 1 -> 2)) + } + + test(s"$RunLengthEncoding with $typeName: single long run for decompress()") { + skeletonForDecompress(1, Seq(0 -> 1000)) + } + + test(s"$RunLengthEncoding with $typeName: single case with null for decompress()") { + skeletonForDecompress(2, Seq(0 -> 2, nullValue -> 2)) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/TestCompressibleColumnBuilder.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/TestCompressibleColumnBuilder.scala index 5e078f251375a..310cb0be5f5a2 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/TestCompressibleColumnBuilder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/TestCompressibleColumnBuilder.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.execution.columnar._ -import org.apache.spark.sql.types.AtomicType +import org.apache.spark.sql.types.{AtomicType, DataType} class TestCompressibleColumnBuilder[T <: AtomicType]( override val columnStats: ColumnStats, @@ -42,3 +42,10 @@ object TestCompressibleColumnBuilder { builder } } + +object ColumnBuilderHelper { + def apply( + dataType: DataType, batchSize: Int, name: String, useCompression: Boolean): ColumnBuilder = { + ColumnBuilder(dataType, batchSize, name, useCompression) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 85da8270d4cba..c5c8ae3a17c6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -20,7 +20,10 @@ package org.apache.spark.sql.execution.vectorized import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.execution.columnar.ColumnAccessor +import org.apache.spark.sql.execution.columnar.compression.ColumnBuilderHelper import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -31,14 +34,21 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { try block(vector) finally vector.close() } + private def withVectors( + size: Int, + dt: DataType)( + block: WritableColumnVector => Unit): Unit = { + withVector(new OnHeapColumnVector(size, dt))(block) + withVector(new OffHeapColumnVector(size, dt))(block) + } + private def testVectors( name: String, size: Int, dt: DataType)( block: WritableColumnVector => Unit): Unit = { test(name) { - withVector(new OnHeapColumnVector(size, dt))(block) - withVector(new OffHeapColumnVector(size, dt))(block) + withVectors(size, dt)(block) } } @@ -218,4 +228,173 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { (0 until 8).foreach(i => assert(testVector.isNullAt(i) == (i % 2 == 0))) } } + + test("CachedBatch boolean Apis") { + val dataType = BooleanType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setBoolean(0, i % 2 == 0) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getBoolean(i) == (i % 2 == 0)) + } + } + } + + test("CachedBatch byte Apis") { + val dataType = ByteType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setByte(0, i.toByte) + 
columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getByte(i) == i) + } + } + } + + test("CachedBatch short Apis") { + val dataType = ShortType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setShort(0, i.toShort) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getShort(i) == i) + } + } + } + + test("CachedBatch int Apis") { + val dataType = IntegerType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setInt(0, i) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getInt(i) == i) + } + } + } + + test("CachedBatch long Apis") { + val dataType = LongType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setLong(0, i.toLong) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getLong(i) == i.toLong) + } + } + } + + test("CachedBatch float Apis") { + val dataType = FloatType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setFloat(0, i.toFloat) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getFloat(i) == i.toFloat) + } + } + } + + test("CachedBatch double Apis") { + val dataType = DoubleType + val columnBuilder = ColumnBuilderHelper(dataType, 1024, "col", true) + val row = new SpecificInternalRow(Array(dataType)) + + row.setNullAt(0) + columnBuilder.appendFrom(row, 0) + for (i <- 1 until 16) { + row.setDouble(0, i.toDouble) + columnBuilder.appendFrom(row, 0) + } + + withVectors(16, dataType) { testVector => + val columnAccessor = ColumnAccessor(dataType, columnBuilder.build) + 
ColumnAccessor.decompress(columnAccessor, testVector, 16) + + assert(testVector.isNullAt(0) == true) + for (i <- 1 until 16) { + assert(testVector.isNullAt(i) == false) + assert(testVector.getDouble(i) == i.toDouble) + } + } + } } + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 983eb103682c1..0b179aa97c479 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -413,7 +413,7 @@ class ColumnarBatchSuite extends SparkFunSuite { reference.zipWithIndex.foreach { v => assert(v._1 == column.getLong(v._2), "idx=" + v._2 + - " Seed = " + seed + " MemMode=" + memMode) + " Seed = " + seed + " MemMode=" + memMode) if (memMode == MemoryMode.OFF_HEAP) { val addr = column.valuesNativeAddress() assert(v._1 == Platform.getLong(null, addr + 8 * v._2)) @@ -1120,7 +1120,7 @@ class ColumnarBatchSuite extends SparkFunSuite { } batch.close() } - }} + }} /** * This test generates a random schema data, serializes it to column batches and verifies the From 4a779bdac3e75c17b7d36c5a009ba6c948fa9fb6 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 4 Oct 2017 10:08:24 -0700 Subject: [PATCH 664/779] [SPARK-21871][SQL] Check actual bytecode size when compiling generated code ## What changes were proposed in this pull request? This PR adds code to check the actual bytecode size when compiling generated code. In #18810, we added code to give up code compilation and use interpreter execution in `SparkPlan` if the number of lines of a generated function goes over `maxLinesPerFunction`. But we already have code to collect metrics for compiled bytecode size in the `CodeGenerator` object, so we can easily reuse that code for this purpose. ## How was this patch tested? Added tests in `WholeStageCodegenSuite`. Author: Takeshi Yamamuro Closes #19083 from maropu/SPARK-21871.
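As a rough sketch of the new behavior (condensed from the `WholeStageCodegenExec` changes below, so the enclosing class, `cleanedSource`, and `sqlContext` are assumed to be in scope): `CodeGenerator.compile` now returns the generated class together with the maximum method bytecode size, and `doExecute` falls back to the interpreted child plan when that size exceeds `spark.sql.codegen.hugeMethodLimit` (default 8000, the OpenJDK `HugeMethodLimit`).

```scala
// Condensed sketch of the fallback decision, not the literal patch code.
val (_, maxCodeSize) = CodeGenerator.compile(cleanedSource)
if (maxCodeSize > sqlContext.conf.hugeMethodLimit) {
  // The JIT refuses to compile methods whose bytecode exceeds HugeMethodLimit,
  // so interpreting the child plan is usually faster than running huge generated code.
  logWarning(s"Generated method bytecode size $maxCodeSize went over the limit " +
    s"${sqlContext.conf.hugeMethodLimit}; whole-stage codegen is disabled for this plan")
  return child.execute()
}
```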
--- .../expressions/codegen/CodeFormatter.scala | 8 --- .../expressions/codegen/CodeGenerator.scala | 59 +++++++++---------- .../codegen/GenerateMutableProjection.scala | 4 +- .../codegen/GenerateOrdering.scala | 3 +- .../codegen/GeneratePredicate.scala | 3 +- .../codegen/GenerateSafeProjection.scala | 4 +- .../codegen/GenerateUnsafeProjection.scala | 4 +- .../codegen/GenerateUnsafeRowJoiner.scala | 4 +- .../apache/spark/sql/internal/SQLConf.scala | 15 ++--- .../codegen/CodeFormatterSuite.scala | 32 ---------- .../sql/execution/WholeStageCodegenExec.scala | 25 ++++---- .../columnar/GenerateColumnAccessor.scala | 3 +- .../execution/WholeStageCodegenSuite.scala | 43 ++++---------- .../benchmark/AggregateBenchmark.scala | 36 +++++------ 14 files changed, 94 insertions(+), 149 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala index 7b398f424cead..60e600d8dbd8f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala @@ -89,14 +89,6 @@ object CodeFormatter { } new CodeAndComment(code.result().trim(), map) } - - def stripExtraNewLinesAndComments(input: String): String = { - val commentReg = - ("""([ |\t]*?\/\*[\s|\S]*?\*\/[ |\t]*?)|""" + // strip /*comment*/ - """([ |\t]*?\/\/[\s\S]*?\n)""").r // strip //comment - val codeWithoutComment = commentReg.replaceAllIn(input, "") - codeWithoutComment.replaceAll("""\n\s*\n""", "\n") // strip ExtraNewLines - } } private class CodeFormatter { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index f3b45799c5688..f9c5ef8439085 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -373,20 +373,6 @@ class CodegenContext { */ private val placeHolderToComments = new mutable.HashMap[String, String] - /** - * It will count the lines of every Java function generated by whole-stage codegen, - * if there is a function of length greater than spark.sql.codegen.maxLinesPerFunction, - * it will return true. - */ - def isTooLongGeneratedFunction: Boolean = { - classFunctions.values.exists { _.values.exists { - code => - val codeWithoutComments = CodeFormatter.stripExtraNewLinesAndComments(code) - codeWithoutComments.count(_ == '\n') > SQLConf.get.maxLinesPerFunction - } - } - } - /** * Returns a term name that is unique within this instance of a `CodegenContext`. */ @@ -1020,10 +1006,16 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin } object CodeGenerator extends Logging { + + // This is the value of HugeMethodLimit in the OpenJDK JVM settings + val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000 + /** * Compile the Java source code into a Java class, using Janino. + * + * @return a pair of a generated class and the max bytecode size of generated functions. */ - def compile(code: CodeAndComment): GeneratedClass = try { + def compile(code: CodeAndComment): (GeneratedClass, Int) = try { cache.get(code) } catch { // Cache.get() may wrap the original exception. 
See the following URL @@ -1036,7 +1028,7 @@ object CodeGenerator extends Logging { /** * Compile the Java source code into a Java class, using Janino. */ - private[this] def doCompile(code: CodeAndComment): GeneratedClass = { + private[this] def doCompile(code: CodeAndComment): (GeneratedClass, Int) = { val evaluator = new ClassBodyEvaluator() // A special classloader used to wrap the actual parent classloader of @@ -1075,9 +1067,9 @@ object CodeGenerator extends Logging { s"\n${CodeFormatter.format(code)}" }) - try { + val maxCodeSize = try { evaluator.cook("generated.java", code.body) - recordCompilationStats(evaluator) + updateAndGetCompilationStats(evaluator) } catch { case e: JaninoRuntimeException => val msg = s"failed to compile: $e" @@ -1092,13 +1084,15 @@ object CodeGenerator extends Logging { logInfo(s"\n${CodeFormatter.format(code, maxLines)}") throw new CompileException(msg, e.getLocation) } - evaluator.getClazz().newInstance().asInstanceOf[GeneratedClass] + + (evaluator.getClazz().newInstance().asInstanceOf[GeneratedClass], maxCodeSize) } /** - * Records the generated class and method bytecode sizes by inspecting janino private fields. + * Returns the max bytecode size of the generated functions by inspecting janino private fields. + * Also, this method updates the metrics information. */ - private def recordCompilationStats(evaluator: ClassBodyEvaluator): Unit = { + private def updateAndGetCompilationStats(evaluator: ClassBodyEvaluator): Int = { // First retrieve the generated classes. val classes = { val resultField = classOf[SimpleCompiler].getDeclaredField("result") @@ -1113,23 +1107,26 @@ object CodeGenerator extends Logging { val codeAttr = Utils.classForName("org.codehaus.janino.util.ClassFile$CodeAttribute") val codeAttrField = codeAttr.getDeclaredField("code") codeAttrField.setAccessible(true) - classes.foreach { case (_, classBytes) => + val codeSizes = classes.flatMap { case (_, classBytes) => CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.update(classBytes.length) try { val cf = new ClassFile(new ByteArrayInputStream(classBytes)) - cf.methodInfos.asScala.foreach { method => - method.getAttributes().foreach { a => - if (a.getClass.getName == codeAttr.getName) { - CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update( - codeAttrField.get(a).asInstanceOf[Array[Byte]].length) - } + val stats = cf.methodInfos.asScala.flatMap { method => + method.getAttributes().filter(_.getClass.getName == codeAttr.getName).map { a => + val byteCodeSize = codeAttrField.get(a).asInstanceOf[Array[Byte]].length + CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update(byteCodeSize) + byteCodeSize } } + Some(stats) } catch { case NonFatal(e) => logWarning("Error calculating stats of compiled class.", e) + None } - } + }.flatten + + codeSizes.max } /** @@ -1144,8 +1141,8 @@ object CodeGenerator extends Logging { private val cache = CacheBuilder.newBuilder() .maximumSize(100) .build( - new CacheLoader[CodeAndComment, GeneratedClass]() { - override def load(code: CodeAndComment): GeneratedClass = { + new CacheLoader[CodeAndComment, (GeneratedClass, Int)]() { + override def load(code: CodeAndComment): (GeneratedClass, Int) = { val startTime = System.nanoTime() val result = doCompile(code) val endTime = System.nanoTime() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 
3768dcde00a4e..b5429fade53cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -142,7 +142,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}") - val c = CodeGenerator.compile(code) - c.generate(ctx.references.toArray).asInstanceOf[MutableProjection] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(ctx.references.toArray).asInstanceOf[MutableProjection] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala index 4e47895985209..1639d1b9dda1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala @@ -185,7 +185,8 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated Ordering by ${ordering.mkString(",")}:\n${CodeFormatter.format(code)}") - CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[BaseOrdering] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(ctx.references.toArray).asInstanceOf[BaseOrdering] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala index e35b9dda6c017..e0fabad6d089a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala @@ -78,6 +78,7 @@ object GeneratePredicate extends CodeGenerator[Expression, Predicate] { new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") - CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(ctx.references.toArray).asInstanceOf[Predicate] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 192701a829686..1e4ac3f2afd52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -189,8 +189,8 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}") - val c = CodeGenerator.compile(code) + val (clazz, _) = CodeGenerator.compile(code) val resultRow = new SpecificInternalRow(expressions.map(_.dataType)) - c.generate(ctx.references.toArray :+ 
resultRow).asInstanceOf[Projection] + clazz.generate(ctx.references.toArray :+ resultRow).asInstanceOf[Projection] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index f2a66efc98e71..4bd50aee05514 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -409,7 +409,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}") - val c = CodeGenerator.compile(code) - c.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index 4aa5ec82471ec..6bc72a0d75c6d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -196,7 +196,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U val code = CodeFormatter.stripOverlappingComments(new CodeAndComment(codeBody, Map.empty)) logDebug(s"SpecificUnsafeRowJoiner($schema1, $schema2):\n${CodeFormatter.format(code)}") - val c = CodeGenerator.compile(code) - c.generate(Array.empty).asInstanceOf[UnsafeRowJoiner] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(Array.empty).asInstanceOf[UnsafeRowJoiner] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1a73d168b9b6e..58323740b80cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -30,6 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -575,15 +576,15 @@ object SQLConf { "disable logging or -1 to apply no limit.") .createWithDefault(1000) - val WHOLESTAGE_MAX_LINES_PER_FUNCTION = buildConf("spark.sql.codegen.maxLinesPerFunction") + val WHOLESTAGE_HUGE_METHOD_LIMIT = buildConf("spark.sql.codegen.hugeMethodLimit") .internal() - .doc("The maximum lines of a single Java function generated by whole-stage codegen. " + - "When the generated function exceeds this threshold, " + + .doc("The maximum bytecode size of a single compiled Java function generated by whole-stage " + + "codegen. 
When the compiled function exceeds this threshold, " + "the whole-stage codegen is deactivated for this subtree of the current query plan. " + - "The default value 4000 is the max length of byte code JIT supported " + - "for a single function(8000) divided by 2.") + s"The default value is ${CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT} and " + + "this is a limit in the OpenJDK JVM implementation.") .intConf - .createWithDefault(4000) + .createWithDefault(CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT) val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes") .doc("The maximum number of bytes to pack into a single partition when reading files.") @@ -1058,7 +1059,7 @@ class SQLConf extends Serializable with Logging { def loggingMaxLinesForCodegen: Int = getConf(CODEGEN_LOGGING_MAX_LINES) - def maxLinesPerFunction: Int = getConf(WHOLESTAGE_MAX_LINES_PER_FUNCTION) + def hugeMethodLimit: Int = getConf(WHOLESTAGE_HUGE_METHOD_LIMIT) def tableRelationCacheSize: Int = getConf(StaticSQLConf.FILESOURCE_TABLE_RELATION_CACHE_SIZE) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala index a0f1a64b0ab08..9d0a41661beaa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala @@ -53,38 +53,6 @@ class CodeFormatterSuite extends SparkFunSuite { assert(reducedCode.body === "/*project_c4*/") } - test("removing extra new lines and comments") { - val code = - """ - |/* - | * multi - | * line - | * comments - | */ - | - |public function() { - |/*comment*/ - | /*comment_with_space*/ - |code_body - |//comment - |code_body - | //comment_with_space - | - |code_body - |} - """.stripMargin - - val reducedCode = CodeFormatter.stripExtraNewLinesAndComments(code) - assert(reducedCode === - """ - |public function() { - |code_body - |code_body - |code_body - |} - """.stripMargin) - } - testCase("basic example") { """ |class A { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 268ccfa4edfa0..9073d599ac43d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -380,16 +380,8 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co override def doExecute(): RDD[InternalRow] = { val (ctx, cleanedSource) = doCodeGen() - if (ctx.isTooLongGeneratedFunction) { - logWarning("Found too long generated codes and JIT optimization might not work, " + - "Whole-stage codegen disabled for this plan, " + - "You can change the config spark.sql.codegen.MaxFunctionLength " + - "to adjust the function length limit:\n " - + s"$treeString") - return child.execute() - } // try to compile and fallback if it failed - try { + val (_, maxCodeSize) = try { CodeGenerator.compile(cleanedSource) } catch { case _: Exception if !Utils.isTesting && sqlContext.conf.codegenFallback => @@ -397,6 +389,17 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co logWarning(s"Whole-stage codegen disabled for this plan:\n $treeString") return child.execute() } + + // Check if compiled code has a too 
large function + if (maxCodeSize > sqlContext.conf.hugeMethodLimit) { + logWarning(s"Found too long generated codes and JIT optimization might not work: " + + s"the bytecode size was $maxCodeSize, this value went over the limit " + + s"${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was disabled " + + s"for this plan. To avoid this, you can raise the limit " + + s"${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}:\n$treeString") + return child.execute() + } + val references = ctx.references.toArray val durationMs = longMetric("pipelineTime") @@ -405,7 +408,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co assert(rdds.size <= 2, "Up to two input RDDs can be supported") if (rdds.length == 1) { rdds.head.mapPartitionsWithIndex { (index, iter) => - val clazz = CodeGenerator.compile(cleanedSource) + val (clazz, _) = CodeGenerator.compile(cleanedSource) val buffer = clazz.generate(references).asInstanceOf[BufferedRowIterator] buffer.init(index, Array(iter)) new Iterator[InternalRow] { @@ -424,7 +427,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co // a small hack to obtain the correct partition index }.mapPartitionsWithIndex { (index, zippedIter) => val (leftIter, rightIter) = zippedIter.next() - val clazz = CodeGenerator.compile(cleanedSource) + val (clazz, _) = CodeGenerator.compile(cleanedSource) val buffer = clazz.generate(references).asInstanceOf[BufferedRowIterator] buffer.init(index, Array(leftIter, rightIter)) new Iterator[InternalRow] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index da34643281911..ae600c1ffae8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -227,6 +227,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated ColumnarIterator:\n${CodeFormatter.format(code)}") - CodeGenerator.compile(code).generate(Array.empty).asInstanceOf[ColumnarIterator] + val (clazz, _) = CodeGenerator.compile(code) + clazz.generate(Array.empty).asInstanceOf[ColumnarIterator] } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index beeee6a97c8dd..aaa77b3ee6201 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -17,10 +17,8 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{Column, Dataset, Row} -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{Add, Literal, Stack} -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec @@ -151,7 +149,7 @@ class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext 
{ } } - def genGroupByCodeGenContext(caseNum: Int): CodegenContext = { + def genGroupByCode(caseNum: Int): CodeAndComment = { val caseExp = (1 to caseNum).map { i => s"case when id > $i and id <= ${i + 1} then 1 else 0 end as v$i" }.toList @@ -176,34 +174,15 @@ class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { }) assert(wholeStageCodeGenExec.isDefined) - wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._1 + wholeStageCodeGenExec.get.asInstanceOf[WholeStageCodegenExec].doCodeGen()._2 } - test("SPARK-21603 check there is a too long generated function") { - withSQLConf(SQLConf.WHOLESTAGE_MAX_LINES_PER_FUNCTION.key -> "1500") { - val ctx = genGroupByCodeGenContext(30) - assert(ctx.isTooLongGeneratedFunction === true) - } - } - - test("SPARK-21603 check there is not a too long generated function") { - withSQLConf(SQLConf.WHOLESTAGE_MAX_LINES_PER_FUNCTION.key -> "1500") { - val ctx = genGroupByCodeGenContext(1) - assert(ctx.isTooLongGeneratedFunction === false) - } - } - - test("SPARK-21603 check there is not a too long generated function when threshold is Int.Max") { - withSQLConf(SQLConf.WHOLESTAGE_MAX_LINES_PER_FUNCTION.key -> Int.MaxValue.toString) { - val ctx = genGroupByCodeGenContext(30) - assert(ctx.isTooLongGeneratedFunction === false) - } - } - - test("SPARK-21603 check there is a too long generated function when threshold is 0") { - withSQLConf(SQLConf.WHOLESTAGE_MAX_LINES_PER_FUNCTION.key -> "0") { - val ctx = genGroupByCodeGenContext(1) - assert(ctx.isTooLongGeneratedFunction === true) - } + test("SPARK-21871 check if we can get large code size when compiling too long functions") { + val codeWithShortFunctions = genGroupByCode(3) + val (_, maxCodeSize1) = CodeGenerator.compile(codeWithShortFunctions) + assert(maxCodeSize1 < SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) + val codeWithLongFunctions = genGroupByCode(20) + val (_, maxCodeSize2) = CodeGenerator.compile(codeWithLongFunctions) + assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index 691fa9ac5e1e7..aca1be01fa3da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -24,6 +24,7 @@ import org.apache.spark.memory.{StaticMemoryManager, TaskMemoryManager} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.execution.joins.LongToUnsafeRowMap import org.apache.spark.sql.execution.vectorized.AggregateHashMap +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{LongType, StructType} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.hash.Murmur3_x86_32 @@ -301,10 +302,10 @@ class AggregateBenchmark extends BenchmarkBase { */ } - ignore("max function length of wholestagecodegen") { + ignore("max function bytecode size of wholestagecodegen") { val N = 20 << 15 - val benchmark = new Benchmark("max function length of wholestagecodegen", N) + val benchmark = new Benchmark("max function bytecode size", N) def f(): Unit = sparkSession.range(N) .selectExpr( "id", @@ -333,33 +334,34 @@ class AggregateBenchmark extends BenchmarkBase { .sum() .collect() - benchmark.addCase(s"codegen = F") { iter => - 
sparkSession.conf.set("spark.sql.codegen.wholeStage", "false") + benchmark.addCase("codegen = F") { iter => + sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "false") f() } - benchmark.addCase(s"codegen = T maxLinesPerFunction = 10000") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.maxLinesPerFunction", "10000") + benchmark.addCase("codegen = T hugeMethodLimit = 10000") { iter => + sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") + sparkSession.conf.set(SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key, "10000") f() } - benchmark.addCase(s"codegen = T maxLinesPerFunction = 1500") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.maxLinesPerFunction", "1500") + benchmark.addCase("codegen = T hugeMethodLimit = 1500") { iter => + sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") + sparkSession.conf.set(SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key, "1500") f() } benchmark.run() /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_111-b14 on Windows 7 6.1 - Intel64 Family 6 Model 58 Stepping 9, GenuineIntel - max function length of wholestagecodegen: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ---------------------------------------------------------------------------------------------- - codegen = F 462 / 533 1.4 704.4 1.0X - codegen = T maxLinesPerFunction = 10000 3444 / 3447 0.2 5255.3 0.1X - codegen = T maxLinesPerFunction = 1500 447 / 478 1.5 682.1 1.0X + Java HotSpot(TM) 64-Bit Server VM 1.8.0_31-b13 on Mac OS X 10.10.2 + Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz + + max function bytecode size: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + codegen = F 709 / 803 0.9 1082.1 1.0X + codegen = T hugeMethodLimit = 10000 3485 / 3548 0.2 5317.7 0.2X + codegen = T hugeMethodLimit = 1500 636 / 701 1.0 969.9 1.1X */ } From bb035f1ee5cdf88e476b7ed83d59140d669fbe12 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 4 Oct 2017 13:13:51 -0700 Subject: [PATCH 665/779] [SPARK-22169][SQL] support byte length literal as identifier ## What changes were proposed in this pull request? By definition the table name in Spark can be something like `123x`, `25a`, etc., with exceptions for literals like `12L`, `23BD`, etc. However, Spark SQL has a special byte length literal, which stops users from using digits followed by `b`, `k`, `m`, `g` as identifiers. A byte length literal is not a standard SQL literal and is only used in the `tableSample` parser rule. This PR moves the parsing of byte length literals from the lexer to the parser, so that users can use such tokens as identifiers. ## How was this patch tested? regression test Author: Wenchen Fan Closes #19392 from cloud-fan/parser-bug.
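A minimal sketch of the resulting behavior, assuming a `parser` handle to the SQL parser as used in the test suites below:

```scala
// Digits followed by b/k/m/g now parse as ordinary identifiers outside TABLESAMPLE.
parser.parsePlan("SELECT a FROM 1k.2m")                      // table `2m` in database `1k`
parser.parsePlan("CREATE TABLE 1m.2g(a INT) USING parquet")  // no longer rejected by the lexer

// Byte length literals remain meaningful only inside TABLESAMPLE, where they are
// still rejected, now by the parser rule instead of a dedicated lexer token:
// parser.parsePlan("SELECT * FROM t TABLESAMPLE (100M)")    // ParseException
```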
--- .../spark/sql/catalyst/parser/SqlBase.g4 | 25 +++++++----------- .../sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 26 +++++++++++++------ .../sql/catalyst/parser/PlanParserSuite.scala | 1 + .../execution/command/DDLParserSuite.scala | 19 ++++++++++++++ 5 files changed, 49 insertions(+), 24 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index d0a54288780ea..17c8404f8a79c 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -25,7 +25,7 @@ grammar SqlBase; * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. - * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is folllowed + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' * which is not a digit or letter or underscore. */ @@ -40,10 +40,6 @@ grammar SqlBase; } } -tokens { - DELIMITER -} - singleStatement : statement EOF ; @@ -447,12 +443,15 @@ joinCriteria ; sample - : TABLESAMPLE '(' - ( (negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) sampleType=PERCENTLIT) - | (expression sampleType=ROWS) - | sampleType=BYTELENGTH_LITERAL - | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON (identifier | qualifiedName '(' ')'))?)) - ')' + : TABLESAMPLE '(' sampleMethod? ')' + ; + +sampleMethod + : negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) PERCENTLIT #sampleByPercentile + | expression ROWS #sampleByRows + | sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE + (ON (identifier | qualifiedName '(' ')'))? #sampleByBucket + | bytes=expression #sampleByBytes ; identifierList @@ -1004,10 +1003,6 @@ TINYINT_LITERAL : DIGIT+ 'Y' ; -BYTELENGTH_LITERAL - : DIGIT+ ('B' | 'K' | 'M' | 'G') - ; - INTEGER_VALUE : DIGIT+ ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 6ba9ee5446a01..95bc3d674b4f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -99,7 +99,7 @@ class SessionCatalog( protected var currentDb: String = formatDatabaseName(DEFAULT_DATABASE) /** - * Checks if the given name conforms the Hive standard ("[a-zA-z_0-9]+"), + * Checks if the given name conforms the Hive standard ("[a-zA-Z_0-9]+"), * i.e. if this name only contains characters, numbers, and _. 
* * This method is intended to have the same behavior of diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 85b492e83446e..ce367145bc637 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -699,20 +699,30 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging Sample(0.0, fraction, withReplacement = false, (math.random * 1000).toInt, query) } - ctx.sampleType.getType match { - case SqlBaseParser.ROWS => + if (ctx.sampleMethod() == null) { + throw new ParseException("TABLESAMPLE does not accept empty inputs.", ctx) + } + + ctx.sampleMethod() match { + case ctx: SampleByRowsContext => Limit(expression(ctx.expression), query) - case SqlBaseParser.PERCENTLIT => + case ctx: SampleByPercentileContext => val fraction = ctx.percentage.getText.toDouble val sign = if (ctx.negativeSign == null) 1 else -1 sample(sign * fraction / 100.0d) - case SqlBaseParser.BYTELENGTH_LITERAL => - throw new ParseException( - "TABLESAMPLE(byteLengthLiteral) is not supported", ctx) + case ctx: SampleByBytesContext => + val bytesStr = ctx.bytes.getText + if (bytesStr.matches("[0-9]+[bBkKmMgG]")) { + throw new ParseException("TABLESAMPLE(byteLengthLiteral) is not supported", ctx) + } else { + throw new ParseException( + bytesStr + " is not a valid byte length literal, " + + "expected syntax: DIGIT+ ('B' | 'K' | 'M' | 'G')", ctx) + } - case SqlBaseParser.BUCKET if ctx.ON != null => + case ctx: SampleByBucketContext if ctx.ON() != null => if (ctx.identifier != null) { throw new ParseException( "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported", ctx) @@ -721,7 +731,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported", ctx) } - case SqlBaseParser.BUCKET => + case ctx: SampleByBucketContext => sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 306e6f2cfbd37..d34a83c42c67e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -110,6 +110,7 @@ class PlanParserSuite extends AnalysisTest { assertEqual("select distinct a, b from db.c", Distinct(table("db", "c").select('a, 'b))) assertEqual("select all a, b from db.c", table("db", "c").select('a, 'b)) assertEqual("select from tbl", OneRowRelation().select('from.as("tbl"))) + assertEqual("select a from 1k.2m", table("1k", "2m").select('a)) } test("reverse select query") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index fa5172ca8a3e7..eb7c33590b602 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -525,6 +525,25 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { assert(e.message.contains("you can only specify one of them.")) } + test("create 
table - byte length literal table name") { + val sql = "CREATE TABLE 1m.2g(a INT) USING parquet" + + val expectedTableDesc = CatalogTable( + identifier = TableIdentifier("2g", Some("1m")), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty, + schema = new StructType().add("a", IntegerType), + provider = Some("parquet")) + + parser.parsePlan(sql) match { + case CreateTable(tableDesc, _, None) => + assert(tableDesc == expectedTableDesc.copy(createTime = tableDesc.createTime)) + case other => + fail(s"Expected to parse ${classOf[CreateTableCommand].getClass.getName} from query," + + s"got ${other.getClass.getName}: $sql") + } + } + test("insert overwrite directory") { val v1 = "INSERT OVERWRITE DIRECTORY '/tmp/file' USING parquet SELECT 1 as a" parser.parsePlan(v1) match { From 969ffd631746125eb2b83722baf6f6e7ddd2092c Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 4 Oct 2017 19:25:22 -0700 Subject: [PATCH 666/779] [SPARK-22187][SS] Update unsaferow format for saved state such that we can set timeouts when state is null ## What changes were proposed in this pull request? Currently, the group state of a user-defined type is encoded as top-level columns in the UnsafeRows stored in the state store. The timeout timestamp is also saved (when needed) as the last top-level column. Since the group state is serialized to top-level columns, you cannot save "null" as a value of state (setting null in all the top-level columns is not equivalent). So we don't let the user set the timeout without initializing the state for a key. Based on user experience, this leads to confusion. This PR changes the row format such that the state is saved as nested columns. This allows the state to be set to null and avoids these confusing corner cases. ## How was this patch tested? Refactored tests. Author: Tathagata Das Closes #19416 from tdas/SPARK-22187.
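The user-visible effect, as a minimal sketch (the `Event` and `RunningCount` classes are illustrative, and the query is assumed to run with processing-time timeouts enabled; only the `GroupState` calls come from the public API): a `flatMapGroupsWithState` function may now set a timeout for a key whose state has never been set, instead of failing with "Cannot set timeout when state is not defined".

```scala
import org.apache.spark.sql.streaming.GroupState

case class Event(id: String, timestamp: Long)  // illustrative input type
case class RunningCount(count: Long)           // illustrative state type

def trackCounts(
    key: String,
    events: Iterator[Event],
    state: GroupState[RunningCount]): Iterator[String] = {
  if (!state.exists) {
    // Previously this required state.update(...) first, because a null state could not
    // be written alongside the timeout timestamp; with the state stored as a single
    // nullable nested column, setting only the timeout is now allowed.
    state.setTimeoutDuration("10 seconds")
  }
  Iterator.empty
}
```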
--- .../FlatMapGroupsWithStateExec.scala | 133 +++------------ .../FlatMapGroupsWithState_StateManager.scala | 153 ++++++++++++++++++ .../FlatMapGroupsWithStateSuite.scala | 130 ++++++++------- 3 files changed, 246 insertions(+), 170 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index ab690fd5fbbca..aab06d611a5ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -23,10 +23,8 @@ import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Attribut import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution} import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode} -import org.apache.spark.sql.types.IntegerType import org.apache.spark.util.CompletionIterator /** @@ -62,26 +60,7 @@ case class FlatMapGroupsWithStateExec( import GroupStateImpl._ private val isTimeoutEnabled = timeoutConf != NoTimeout - private val timestampTimeoutAttribute = - AttributeReference("timeoutTimestamp", dataType = IntegerType, nullable = false)() - private val stateAttributes: Seq[Attribute] = { - val encSchemaAttribs = stateEncoder.schema.toAttributes - if (isTimeoutEnabled) encSchemaAttribs :+ timestampTimeoutAttribute else encSchemaAttribs - } - // Get the serializer for the state, taking into account whether we need to save timestamps - private val stateSerializer = { - val encoderSerializer = stateEncoder.namedExpressions - if (isTimeoutEnabled) { - encoderSerializer :+ Literal(GroupStateImpl.NO_TIMESTAMP) - } else { - encoderSerializer - } - } - // Get the deserializer for the state. Note that this must be done in the driver, as - // resolving and binding of deserializer expressions to the encoded type can be safely done - // only in the driver. - private val stateDeserializer = stateEncoder.resolveAndBind().deserializer - + val stateManager = new FlatMapGroupsWithState_StateManager(stateEncoder, isTimeoutEnabled) /** Distribute by grouping attributes */ override def requiredChildDistribution: Seq[Distribution] = @@ -109,11 +88,11 @@ case class FlatMapGroupsWithStateExec( child.execute().mapPartitionsWithStateStore[InternalRow]( getStateInfo, groupingAttributes.toStructType, - stateAttributes.toStructType, + stateManager.stateSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { case (store, iter) => - val updater = new StateStoreUpdater(store) + val processor = new InputProcessor(store) // If timeout is based on event time, then filter late data based on watermark val filteredIter = watermarkPredicateForData match { @@ -128,7 +107,7 @@ case class FlatMapGroupsWithStateExec( // all the data has been processed. This is to ensure that the timeout information of all // the keys with data is updated before they are processed for timeouts. 
val outputIterator = - updater.updateStateForKeysWithData(filteredIter) ++ updater.updateStateForTimedOutKeys() + processor.processNewData(filteredIter) ++ processor.processTimedOutState() // Return an iterator of all the rows generated by all the keys, such that when fully // consumed, all the state updates will be committed by the state store @@ -143,7 +122,7 @@ case class FlatMapGroupsWithStateExec( } /** Helper class to update the state store */ - class StateStoreUpdater(store: StateStore) { + class InputProcessor(store: StateStore) { // Converters for translating input keys, values, output data between rows and Java objects private val getKeyObj = @@ -152,14 +131,6 @@ case class FlatMapGroupsWithStateExec( ObjectOperator.deserializeRowToObject(valueDeserializer, dataAttributes) private val getOutputRow = ObjectOperator.wrapObjectToRow(outputObjAttr.dataType) - // Converters for translating state between rows and Java objects - private val getStateObjFromRow = ObjectOperator.deserializeRowToObject( - stateDeserializer, stateAttributes) - private val getStateRowFromObj = ObjectOperator.serializeObjectToRow(stateSerializer) - - // Index of the additional metadata fields in the state row - private val timeoutTimestampIndex = stateAttributes.indexOf(timestampTimeoutAttribute) - // Metrics private val numUpdatedStateRows = longMetric("numUpdatedStateRows") private val numOutputRows = longMetric("numOutputRows") @@ -168,20 +139,19 @@ case class FlatMapGroupsWithStateExec( * For every group, get the key, values and corresponding state and call the function, * and return an iterator of rows */ - def updateStateForKeysWithData(dataIter: Iterator[InternalRow]): Iterator[InternalRow] = { + def processNewData(dataIter: Iterator[InternalRow]): Iterator[InternalRow] = { val groupedIter = GroupedIterator(dataIter, groupingAttributes, child.output) groupedIter.flatMap { case (keyRow, valueRowIter) => val keyUnsafeRow = keyRow.asInstanceOf[UnsafeRow] callFunctionAndUpdateState( - keyUnsafeRow, + stateManager.getState(store, keyUnsafeRow), valueRowIter, - store.get(keyUnsafeRow), hasTimedOut = false) } } /** Find the groups that have timeout set and are timing out right now, and call the function */ - def updateStateForTimedOutKeys(): Iterator[InternalRow] = { + def processTimedOutState(): Iterator[InternalRow] = { if (isTimeoutEnabled) { val timeoutThreshold = timeoutConf match { case ProcessingTimeTimeout => batchTimestampMs.get @@ -190,12 +160,11 @@ case class FlatMapGroupsWithStateExec( throw new IllegalStateException( s"Cannot filter timed out keys for $timeoutConf") } - val timingOutKeys = store.getRange(None, None).filter { rowPair => - val timeoutTimestamp = getTimeoutTimestamp(rowPair.value) - timeoutTimestamp != NO_TIMESTAMP && timeoutTimestamp < timeoutThreshold + val timingOutKeys = stateManager.getAllState(store).filter { state => + state.timeoutTimestamp != NO_TIMESTAMP && state.timeoutTimestamp < timeoutThreshold } - timingOutKeys.flatMap { rowPair => - callFunctionAndUpdateState(rowPair.key, Iterator.empty, rowPair.value, hasTimedOut = true) + timingOutKeys.flatMap { stateData => + callFunctionAndUpdateState(stateData, Iterator.empty, hasTimedOut = true) } } else Iterator.empty } @@ -205,72 +174,43 @@ case class FlatMapGroupsWithStateExec( * iterator. Note that the store updating is lazy, that is, the store will be updated only * after the returned iterator is fully consumed. 
* - * @param keyRow Row representing the key, cannot be null + * @param stateData All the data related to the state to be updated * @param valueRowIter Iterator of values as rows, cannot be null, but can be empty - * @param prevStateRow Row representing the previous state, can be null * @param hasTimedOut Whether this function is being called for a key timeout */ private def callFunctionAndUpdateState( - keyRow: UnsafeRow, + stateData: FlatMapGroupsWithState_StateData, valueRowIter: Iterator[InternalRow], - prevStateRow: UnsafeRow, hasTimedOut: Boolean): Iterator[InternalRow] = { - val keyObj = getKeyObj(keyRow) // convert key to objects + val keyObj = getKeyObj(stateData.keyRow) // convert key to objects val valueObjIter = valueRowIter.map(getValueObj.apply) // convert value rows to objects - val stateObj = getStateObj(prevStateRow) - val keyedState = GroupStateImpl.createForStreaming( - Option(stateObj), + val groupState = GroupStateImpl.createForStreaming( + Option(stateData.stateObj), batchTimestampMs.getOrElse(NO_TIMESTAMP), eventTimeWatermark.getOrElse(NO_TIMESTAMP), timeoutConf, hasTimedOut) // Call function, get the returned objects and convert them to rows - val mappedIterator = func(keyObj, valueObjIter, keyedState).map { obj => + val mappedIterator = func(keyObj, valueObjIter, groupState).map { obj => numOutputRows += 1 getOutputRow(obj) } // When the iterator is consumed, then write changes to state def onIteratorCompletion: Unit = { - - val currentTimeoutTimestamp = keyedState.getTimeoutTimestamp - // If the state has not yet been set but timeout has been set, then - // we have to generate a row to save the timeout. However, attempting serialize - // null using case class encoder throws - - // java.lang.NullPointerException: Null value appeared in non-nullable field: - // If the schema is inferred from a Scala tuple / case class, or a Java bean, please - // try to use scala.Option[_] or other nullable types. 
- if (!keyedState.exists && currentTimeoutTimestamp != NO_TIMESTAMP) { - throw new IllegalStateException( - "Cannot set timeout when state is not defined, that is, state has not been" + - "initialized or has been removed") - } - - if (keyedState.hasRemoved) { - store.remove(keyRow) + if (groupState.hasRemoved && groupState.getTimeoutTimestamp == NO_TIMESTAMP) { + stateManager.removeState(store, stateData.keyRow) numUpdatedStateRows += 1 - } else { - val previousTimeoutTimestamp = getTimeoutTimestamp(prevStateRow) - val stateRowToWrite = if (keyedState.hasUpdated) { - getStateRow(keyedState.get) - } else { - prevStateRow - } - - val hasTimeoutChanged = currentTimeoutTimestamp != previousTimeoutTimestamp - val shouldWriteState = keyedState.hasUpdated || hasTimeoutChanged + val currentTimeoutTimestamp = groupState.getTimeoutTimestamp + val hasTimeoutChanged = currentTimeoutTimestamp != stateData.timeoutTimestamp + val shouldWriteState = groupState.hasUpdated || groupState.hasRemoved || hasTimeoutChanged if (shouldWriteState) { - if (stateRowToWrite == null) { - // This should never happen because checks in GroupStateImpl should avoid cases - // where empty state would need to be written - throw new IllegalStateException("Attempting to write empty state") - } - setTimeoutTimestamp(stateRowToWrite, currentTimeoutTimestamp) - store.put(keyRow, stateRowToWrite) + val updatedStateObj = if (groupState.exists) groupState.get else null + stateManager.putState(store, stateData.keyRow, updatedStateObj, currentTimeoutTimestamp) numUpdatedStateRows += 1 } } @@ -279,28 +219,5 @@ case class FlatMapGroupsWithStateExec( // Return an iterator of rows such that fully consumed, the updated state value will be saved CompletionIterator[InternalRow, Iterator[InternalRow]](mappedIterator, onIteratorCompletion) } - - /** Returns the state as Java object if defined */ - def getStateObj(stateRow: UnsafeRow): Any = { - if (stateRow != null) getStateObjFromRow(stateRow) else null - } - - /** Returns the row for an updated state */ - def getStateRow(obj: Any): UnsafeRow = { - assert(obj != null) - getStateRowFromObj(obj) - } - - /** Returns the timeout timestamp of a state row is set */ - def getTimeoutTimestamp(stateRow: UnsafeRow): Long = { - if (isTimeoutEnabled && stateRow != null) { - stateRow.getLong(timeoutTimestampIndex) - } else NO_TIMESTAMP - } - - /** Set the timestamp in a state row */ - def setTimeoutTimestamp(stateRow: UnsafeRow, timeoutTimestamps: Long): Unit = { - if (isTimeoutEnabled) stateRow.setLong(timeoutTimestampIndex, timeoutTimestamps) - } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala new file mode 100644 index 0000000000000..d077836da847c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.state + +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, CaseWhen, CreateNamedStruct, GetStructField, IsNull, Literal, UnsafeRow} +import org.apache.spark.sql.execution.ObjectOperator +import org.apache.spark.sql.execution.streaming.GroupStateImpl +import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP +import org.apache.spark.sql.types.{IntegerType, LongType, StructType} + + +/** + * Class to serialize/write/read/deserialize state for + * [[org.apache.spark.sql.execution.streaming.FlatMapGroupsWithStateExec]]. + */ +class FlatMapGroupsWithState_StateManager( + stateEncoder: ExpressionEncoder[Any], + shouldStoreTimestamp: Boolean) extends Serializable { + + /** Schema of the state rows saved in the state store */ + val stateSchema = { + val schema = new StructType().add("groupState", stateEncoder.schema, nullable = true) + if (shouldStoreTimestamp) schema.add("timeoutTimestamp", LongType) else schema + } + + /** Get deserialized state and corresponding timeout timestamp for a key */ + def getState(store: StateStore, keyRow: UnsafeRow): FlatMapGroupsWithState_StateData = { + val stateRow = store.get(keyRow) + stateDataForGets.withNew( + keyRow, stateRow, getStateObj(stateRow), getTimestamp(stateRow)) + } + + /** Put state and timeout timestamp for a key */ + def putState(store: StateStore, keyRow: UnsafeRow, state: Any, timestamp: Long): Unit = { + val stateRow = getStateRow(state) + setTimestamp(stateRow, timestamp) + store.put(keyRow, stateRow) + } + + /** Removed all information related to a key */ + def removeState(store: StateStore, keyRow: UnsafeRow): Unit = { + store.remove(keyRow) + } + + /** Get all the keys and corresponding state rows in the state store */ + def getAllState(store: StateStore): Iterator[FlatMapGroupsWithState_StateData] = { + val stateDataForGetAllState = FlatMapGroupsWithState_StateData() + store.getRange(None, None).map { pair => + stateDataForGetAllState.withNew( + pair.key, pair.value, getStateObjFromRow(pair.value), getTimestamp(pair.value)) + } + } + + // Ordinals of the information stored in the state row + private lazy val nestedStateOrdinal = 0 + private lazy val timeoutTimestampOrdinal = 1 + + // Get the serializer for the state, taking into account whether we need to save timestamps + private val stateSerializer = { + val nestedStateExpr = CreateNamedStruct( + stateEncoder.namedExpressions.flatMap(e => Seq(Literal(e.name), e))) + if (shouldStoreTimestamp) { + Seq(nestedStateExpr, Literal(GroupStateImpl.NO_TIMESTAMP)) + } else { + Seq(nestedStateExpr) + } + } + + // Get the deserializer for the state. Note that this must be done in the driver, as + // resolving and binding of deserializer expressions to the encoded type can be safely done + // only in the driver. 
+ private val stateDeserializer = { + val boundRefToNestedState = BoundReference(nestedStateOrdinal, stateEncoder.schema, true) + val deser = stateEncoder.resolveAndBind().deserializer.transformUp { + case BoundReference(ordinal, _, _) => GetStructField(boundRefToNestedState, ordinal) + } + CaseWhen(Seq(IsNull(boundRefToNestedState) -> Literal(null)), elseValue = deser).toCodegen() + } + + // Converters for translating state between rows and Java objects + private lazy val getStateObjFromRow = ObjectOperator.deserializeRowToObject( + stateDeserializer, stateSchema.toAttributes) + private lazy val getStateRowFromObj = ObjectOperator.serializeObjectToRow(stateSerializer) + + // Reusable instance for returning state information + private lazy val stateDataForGets = FlatMapGroupsWithState_StateData() + + /** Returns the state as Java object if defined */ + private def getStateObj(stateRow: UnsafeRow): Any = { + if (stateRow == null) null + else getStateObjFromRow(stateRow) + } + + /** Returns the row for an updated state */ + private def getStateRow(obj: Any): UnsafeRow = { + val row = getStateRowFromObj(obj) + if (obj == null) { + row.setNullAt(nestedStateOrdinal) + } + row + } + + /** Returns the timeout timestamp of a state row is set */ + private def getTimestamp(stateRow: UnsafeRow): Long = { + if (shouldStoreTimestamp && stateRow != null) { + stateRow.getLong(timeoutTimestampOrdinal) + } else NO_TIMESTAMP + } + + /** Set the timestamp in a state row */ + private def setTimestamp(stateRow: UnsafeRow, timeoutTimestamps: Long): Unit = { + if (shouldStoreTimestamp) stateRow.setLong(timeoutTimestampOrdinal, timeoutTimestamps) + } +} + +/** + * Class to capture deserialized state and timestamp return by the state manager. + * This is intended for reuse. 
+ */ +case class FlatMapGroupsWithState_StateData( + var keyRow: UnsafeRow = null, + var stateRow: UnsafeRow = null, + var stateObj: Any = null, + var timeoutTimestamp: Long = -1) { + def withNew( + newKeyRow: UnsafeRow, + newStateRow: UnsafeRow, + newStateObj: Any, + newTimeout: Long): this.type = { + keyRow = newKeyRow + stateRow = newStateRow + stateObj = newStateObj + timeoutTimestamp = newTimeout + this + } +} + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index 9d74a5c701ef1..d2e8beb2f5290 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -289,13 +289,13 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf } } - // Values used for testing StateStoreUpdater + // Values used for testing InputProcessor val currentBatchTimestamp = 1000 val currentBatchWatermark = 1000 val beforeTimeoutThreshold = 999 val afterTimeoutThreshold = 1001 - // Tests for StateStoreUpdater.updateStateForKeysWithData() when timeout = NoTimeout + // Tests for InputProcessor.processNewData() when timeout = NoTimeout for (priorState <- Seq(None, Some(0))) { val priorStateStr = if (priorState.nonEmpty) "prior state set" else "no prior state" val testName = s"NoTimeout - $priorStateStr - " @@ -322,7 +322,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf expectedState = None) // should be removed } - // Tests for StateStoreUpdater.updateStateForKeysWithData() when timeout != NoTimeout + // Tests for InputProcessor.processTimedOutState() when timeout != NoTimeout for (priorState <- Seq(None, Some(0))) { for (priorTimeoutTimestamp <- Seq(NO_TIMESTAMP, 1000)) { var testName = "" @@ -365,6 +365,18 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf expectedState = None) // state should be removed } + // Tests with ProcessingTimeTimeout + if (priorState == None) { + testStateUpdateWithData( + s"ProcessingTimeTimeout - $testName - timeout updated without initializing state", + stateUpdates = state => { state.setTimeoutDuration(5000) }, + timeoutConf = ProcessingTimeTimeout, + priorState = None, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedState = None, + expectedTimeoutTimestamp = currentBatchTimestamp + 5000) + } + testStateUpdateWithData( s"ProcessingTimeTimeout - $testName - state and timeout duration updated", stateUpdates = @@ -375,10 +387,36 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf expectedState = Some(5), // state should change expectedTimeoutTimestamp = currentBatchTimestamp + 5000) // timestamp should change + testStateUpdateWithData( + s"ProcessingTimeTimeout - $testName - timeout updated after state removed", + stateUpdates = state => { state.remove(); state.setTimeoutDuration(5000) }, + timeoutConf = ProcessingTimeTimeout, + priorState = priorState, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedState = None, + expectedTimeoutTimestamp = currentBatchTimestamp + 5000) + + // Tests with EventTimeTimeout + + if (priorState == None) { + testStateUpdateWithData( + s"EventTimeTimeout - $testName - setting timeout without init state not allowed", + stateUpdates = state => { + state.setTimeoutTimestamp(10000) + }, + timeoutConf = EventTimeTimeout, + priorState = 
None, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedState = None, + expectedTimeoutTimestamp = 10000) + } + testStateUpdateWithData( s"EventTimeTimeout - $testName - state and timeout timestamp updated", stateUpdates = - (state: GroupState[Int]) => { state.update(5); state.setTimeoutTimestamp(5000) }, + (state: GroupState[Int]) => { + state.update(5); state.setTimeoutTimestamp(5000) + }, timeoutConf = EventTimeTimeout, priorState = priorState, priorTimeoutTimestamp = priorTimeoutTimestamp, @@ -397,50 +435,23 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf timeoutConf = EventTimeTimeout, priorState = priorState, priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = Some(5), // state should change - expectedTimeoutTimestamp = NO_TIMESTAMP) // timestamp should not update - } - } - - // Currently disallowed cases for StateStoreUpdater.updateStateForKeysWithData(), - // Try to remove these cases in the future - for (priorTimeoutTimestamp <- Seq(NO_TIMESTAMP, 1000)) { - val testName = - if (priorTimeoutTimestamp != NO_TIMESTAMP) "prior timeout set" else "no prior timeout" - testStateUpdateWithData( - s"ProcessingTimeTimeout - $testName - setting timeout without init state not allowed", - stateUpdates = state => { state.setTimeoutDuration(5000) }, - timeoutConf = ProcessingTimeTimeout, - priorState = None, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedException = classOf[IllegalStateException]) - - testStateUpdateWithData( - s"ProcessingTimeTimeout - $testName - setting timeout with state removal not allowed", - stateUpdates = state => { state.remove(); state.setTimeoutDuration(5000) }, - timeoutConf = ProcessingTimeTimeout, - priorState = Some(5), - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedException = classOf[IllegalStateException]) - - testStateUpdateWithData( - s"EventTimeTimeout - $testName - setting timeout without init state not allowed", - stateUpdates = state => { state.setTimeoutTimestamp(10000) }, - timeoutConf = EventTimeTimeout, - priorState = None, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedException = classOf[IllegalStateException]) + expectedState = Some(5), // state should change + expectedTimeoutTimestamp = NO_TIMESTAMP) // timestamp should not update - testStateUpdateWithData( - s"EventTimeTimeout - $testName - setting timeout with state removal not allowed", - stateUpdates = state => { state.remove(); state.setTimeoutTimestamp(10000) }, - timeoutConf = EventTimeTimeout, - priorState = Some(5), - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedException = classOf[IllegalStateException]) + testStateUpdateWithData( + s"EventTimeTimeout - $testName - setting timeout with state removal not allowed", + stateUpdates = state => { + state.remove(); state.setTimeoutTimestamp(10000) + }, + timeoutConf = EventTimeTimeout, + priorState = priorState, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedState = None, + expectedTimeoutTimestamp = 10000) + } } - // Tests for StateStoreUpdater.updateStateForTimedOutKeys() + // Tests for InputProcessor.processTimedOutState() val preTimeoutState = Some(5) for (timeoutConf <- Seq(ProcessingTimeTimeout, EventTimeTimeout)) { testStateUpdateWithTimeout( @@ -924,7 +935,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf if (priorState.isEmpty && priorTimeoutTimestamp != NO_TIMESTAMP) { return // there can be no prior timestamp, when there is no prior state } - test(s"StateStoreUpdater - updates with 
data - $testName") { + test(s"InputProcessor - process new data - $testName") { val mapGroupsFunc = (key: Int, values: Iterator[Int], state: GroupState[Int]) => { assert(state.hasTimedOut === false, "hasTimedOut not false") assert(values.nonEmpty, "Some value is expected") @@ -946,7 +957,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf expectedState: Option[Int], expectedTimeoutTimestamp: Long = NO_TIMESTAMP): Unit = { - test(s"StateStoreUpdater - updates for timeout - $testName") { + test(s"InputProcessor - process timed out state - $testName") { val mapGroupsFunc = (key: Int, values: Iterator[Int], state: GroupState[Int]) => { assert(state.hasTimedOut === true, "hasTimedOut not true") assert(values.isEmpty, "values not empty") @@ -973,21 +984,20 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf val store = newStateStore() val mapGroupsSparkPlan = newFlatMapGroupsWithStateExec( mapGroupsFunc, timeoutConf, currentBatchTimestamp) - val updater = new mapGroupsSparkPlan.StateStoreUpdater(store) + val inputProcessor = new mapGroupsSparkPlan.InputProcessor(store) + val stateManager = mapGroupsSparkPlan.stateManager val key = intToRow(0) // Prepare store with prior state configs - if (priorState.nonEmpty) { - val row = updater.getStateRow(priorState.get) - updater.setTimeoutTimestamp(row, priorTimeoutTimestamp) - store.put(key.copy(), row.copy()) + if (priorState.nonEmpty || priorTimeoutTimestamp != NO_TIMESTAMP) { + stateManager.putState(store, key, priorState.orNull, priorTimeoutTimestamp) } // Call updating function to update state store def callFunction() = { val returnedIter = if (testTimeoutUpdates) { - updater.updateStateForTimedOutKeys() + inputProcessor.processTimedOutState() } else { - updater.updateStateForKeysWithData(Iterator(key)) + inputProcessor.processNewData(Iterator(key)) } returnedIter.size // consume the iterator to force state updates } @@ -998,15 +1008,11 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf } else { // Call function to update and verify updated state in store callFunction() - val updatedStateRow = store.get(key) - assert( - Option(updater.getStateObj(updatedStateRow)).map(_.toString.toInt) === expectedState, + val updatedState = stateManager.getState(store, key) + assert(Option(updatedState.stateObj).map(_.toString.toInt) === expectedState, "final state not as expected") - if (updatedStateRow != null) { - assert( - updater.getTimeoutTimestamp(updatedStateRow) === expectedTimeoutTimestamp, - "final timeout timestamp not as expected") - } + assert(updatedState.timeoutTimestamp === expectedTimeoutTimestamp, + "final timeout timestamp not as expected") } } From c8affec21c91d638009524955515fc143ad86f20 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 4 Oct 2017 20:58:48 -0700 Subject: [PATCH 667/779] [SPARK-22203][SQL] Add job description for file listing Spark jobs ## What changes were proposed in this pull request? The user may be confused about some 10000-tasks jobs. We can add a job description for these jobs so that the user can figure it out. ## How was this patch tested? The new unit test. Before: screen shot 2017-10-04 at 3 22 09 pm After: screen shot 2017-10-04 at 3 13 51 pm Author: Shixiong Zhu Closes #19432 from zsxwing/SPARK-22203. 
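For illustration only, a minimal sketch of the set/restore pattern the patch wraps around the parallel listing job; `sc`, `paths`, `serializedPaths`, `numParallelism`, and `listLeafFilesForPaths` are placeholders rather than names from the patch, and "spark.job.description" is the local property that `SparkContext.SPARK_JOB_DESCRIPTION` refers to.

```
// Remember whatever description the caller had set, label the listing job, then restore it.
val previousDescription = sc.getLocalProperty("spark.job.description")
try {
  sc.setJobDescription(s"Listing leaf files and directories for ${paths.size} paths")
  sc.parallelize(serializedPaths, numParallelism)
    .mapPartitions(listLeafFilesForPaths) // placeholder for the per-path leaf-file listing
    .collect()
} finally {
  // Restore the previous description (possibly null) so later jobs are not mislabeled.
  sc.setJobDescription(previousDescription)
}
```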
--- .../datasources/InMemoryFileIndex.scala | 85 +++++++++++-------- .../sql/test/DataFrameReaderWriterSuite.scala | 31 +++++++ 2 files changed, 81 insertions(+), 35 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 203d449717512..318ada0ceefc5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.SparkSession @@ -187,42 +188,56 @@ object InMemoryFileIndex extends Logging { // in case of large #defaultParallelism. val numParallelism = Math.min(paths.size, parallelPartitionDiscoveryParallelism) - val statusMap = sparkContext - .parallelize(serializedPaths, numParallelism) - .mapPartitions { pathStrings => - val hadoopConf = serializableConfiguration.value - pathStrings.map(new Path(_)).toSeq.map { path => - (path, listLeafFiles(path, hadoopConf, filter, None)) - }.iterator - }.map { case (path, statuses) => - val serializableStatuses = statuses.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) + val previousJobDescription = sparkContext.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) + val statusMap = try { + val description = paths.size match { + case 0 => + s"Listing leaf files and directories 0 paths" + case 1 => + s"Listing leaf files and directories for 1 path:
${paths(0)}" + case s => + s"Listing leaf files and directories for $s paths:
${paths(0)}, ..." } - (path.toString, serializableStatuses) - }.collect() + sparkContext.setJobDescription(description) + sparkContext + .parallelize(serializedPaths, numParallelism) + .mapPartitions { pathStrings => + val hadoopConf = serializableConfiguration.value + pathStrings.map(new Path(_)).toSeq.map { path => + (path, listLeafFiles(path, hadoopConf, filter, None)) + }.iterator + }.map { case (path, statuses) => + val serializableStatuses = statuses.map { status => + // Turn FileStatus into SerializableFileStatus so we can send it back to the driver + val blockLocations = status match { + case f: LocatedFileStatus => + f.getBlockLocations.map { loc => + SerializableBlockLocation( + loc.getNames, + loc.getHosts, + loc.getOffset, + loc.getLength) + } + + case _ => + Array.empty[SerializableBlockLocation] + } + + SerializableFileStatus( + status.getPath.toString, + status.getLen, + status.isDirectory, + status.getReplication, + status.getBlockSize, + status.getModificationTime, + status.getAccessTime, + blockLocations) + } + (path.toString, serializableStatuses) + }.collect() + } finally { + sparkContext.setJobDescription(previousJobDescription) + } // turn SerializableFileStatus back to Status statusMap.map { case (path, serializableStatuses) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 569bac156b531..a5d7e6257a6df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -21,10 +21,14 @@ import java.io.File import java.util.Locale import java.util.concurrent.ConcurrentLinkedQueue +import scala.collection.JavaConverters._ + import org.scalatest.BeforeAndAfter +import org.apache.spark.SparkContext import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.internal.SQLConf @@ -775,4 +779,31 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be } } } + + test("use Spark jobs to list files") { + withSQLConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD.key -> "1") { + withTempDir { dir => + val jobDescriptions = new ConcurrentLinkedQueue[String]() + val jobListener = new SparkListener { + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + jobDescriptions.add(jobStart.properties.getProperty(SparkContext.SPARK_JOB_DESCRIPTION)) + } + } + sparkContext.addSparkListener(jobListener) + try { + spark.range(0, 3).map(i => (i, i)) + .write.partitionBy("_1").mode("overwrite").parquet(dir.getCanonicalPath) + // normal file paths + checkDatasetUnorderly( + spark.read.parquet(dir.getCanonicalPath).as[(Long, Long)], + 0L -> 0L, 1L -> 1L, 2L -> 2L) + sparkContext.listenerBus.waitUntilEmpty(10000) + assert(jobDescriptions.asScala.toList.exists( + _.contains("Listing leaf files and directories for 3 paths"))) + } finally { + sparkContext.removeSparkListener(jobListener) + } + } + } + } } From ae61f187aa0471242c046fdeac6ed55b9b98a3f6 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 5 Oct 2017 23:36:18 +0900 Subject: [PATCH 668/779] [SPARK-22206][SQL][SPARKR] gapply in R can't work on empty grouping columns ## What 
changes were proposed in this pull request? Looks like `FlatMapGroupsInRExec.requiredChildDistribution` didn't consider empty grouping attributes. It should be a problem when running `EnsureRequirements` and `gapply` in R can't work on empty grouping columns. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #19436 from viirya/fix-flatmapinr-distribution. --- R/pkg/tests/fulltests/test_sparkSQL.R | 5 +++++ .../main/scala/org/apache/spark/sql/execution/objects.scala | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 7f781f2f66a7f..bbea25bc4da5c 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -3075,6 +3075,11 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x }) expect_identical(df1Collect, expected) + # gapply on empty grouping columns. + df1 <- gapply(df, c(), function(key, x) { x }, schema(df)) + actual <- collect(df1) + expect_identical(actual, expected) + # Computes the sum of second column by grouping on the first and third columns # and checks if the sum is larger than 2 schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index 5a3fcad38888e..c68975bea490f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -394,7 +394,11 @@ case class FlatMapGroupsInRExec( override def producedAttributes: AttributeSet = AttributeSet(outputObjAttr) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(groupingAttributes) :: Nil + if (groupingAttributes.isEmpty) { + AllTuples :: Nil + } else { + ClusteredDistribution(groupingAttributes) :: Nil + } override def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq(groupingAttributes.map(SortOrder(_, Ascending))) From 83488cc3180ca18f829516f550766efb3095881e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 5 Oct 2017 23:33:49 -0700 Subject: [PATCH 669/779] [SPARK-21871][SQL] Fix infinite loop when bytecode size is larger than spark.sql.codegen.hugeMethodLimit ## What changes were proposed in this pull request? When exceeding `spark.sql.codegen.hugeMethodLimit`, the runtime fallbacks to the Volcano iterator solution. This could cause an infinite loop when `FileSourceScanExec` can use the columnar batch to read the data. This PR is to fix the issue. ## How was this patch tested? Added a test Author: gatorsmile Closes #19440 from gatorsmile/testt. 
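As an aside (not part of the patch), the reworded log message points users at raising `spark.sql.codegen.hugeMethodLimit` when they would rather keep whole-stage codegen for very wide plans. A hedged sketch of that workaround follows; the active `spark` session, the 65535 value, and the wide projection are illustrative only, the latter modeled loosely on the new regression test.

```
// 65535 is the JVM's per-method bytecode cap, so raising the limit this far
// effectively disables the size check.
spark.conf.set("spark.sql.codegen.hugeMethodLimit", "65535")

// A wide projection in the spirit of the new test: 201 derived columns over a small range.
import spark.implicits._
val wide = spark.range(10).select(Seq.tabulate(201)(i => ($"id" + i).as(s"c$i")): _*)
wide.collect() // force execution; if generated code is still too large, the log explains the fallback
```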
--- .../sql/execution/WholeStageCodegenExec.scala | 12 ++++++---- .../execution/WholeStageCodegenSuite.scala | 23 +++++++++++++++++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 9073d599ac43d..1aaaf896692d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -392,12 +392,16 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co // Check if compiled code has a too large function if (maxCodeSize > sqlContext.conf.hugeMethodLimit) { - logWarning(s"Found too long generated codes and JIT optimization might not work: " + - s"the bytecode size was $maxCodeSize, this value went over the limit " + + logInfo(s"Found too long generated codes and JIT optimization might not work: " + + s"the bytecode size ($maxCodeSize) is above the limit " + s"${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was disabled " + s"for this plan. To avoid this, you can raise the limit " + - s"${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}:\n$treeString") - return child.execute() + s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString") + child match { + // The fallback solution of batch file source scan still uses WholeStageCodegenExec + case f: FileSourceScanExec if f.supportsBatch => // do nothing + case _ => return child.execute() + } } val references = ctx.references.toArray diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index aaa77b3ee6201..098e4cfeb15b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.Row +import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec @@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StringType, StructType} -class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { +class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") @@ -185,4 +185,23 @@ class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext { val (_, maxCodeSize2) = CodeGenerator.compile(codeWithLongFunctions) assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) } + + test("bytecode of batch file scan exceeds the limit of WHOLESTAGE_HUGE_METHOD_LIMIT") { + import testImplicits._ + withTempPath { dir => + val path = dir.getCanonicalPath + val df = spark.range(10).select(Seq.tabulate(201) {i => ('id + i).as(s"c$i")} : _*) + df.write.mode(SaveMode.Overwrite).parquet(path) + + withSQLConf(SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "202", + SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "2000") { + // wide table batch 
scan causes the byte code of codegen exceeds the limit of + // WHOLESTAGE_HUGE_METHOD_LIMIT + val df2 = spark.read.parquet(path) + val fileScan2 = df2.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get + assert(fileScan2.asInstanceOf[FileSourceScanExec].supportsBatch) + checkAnswer(df2, df) + } + } + } } From 0c03297bf0e87944f9fe0535fdae5518228e3e29 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 6 Oct 2017 15:08:28 +0100 Subject: [PATCH 670/779] [SPARK-22142][BUILD][STREAMING] Move Flume support behind a profile, take 2 ## What changes were proposed in this pull request? Move flume behind a profile, take 2. See https://github.com/apache/spark/pull/19365 for most of the back-story. This change should fix the problem by removing the examples module dependency and moving Flume examples to the module itself. It also adds deprecation messages, per a discussion on dev about deprecating for 2.3.0. ## How was this patch tested? Existing tests, which still enable flume integration. Author: Sean Owen Closes #19412 from srowen/SPARK-22142.2. --- dev/create-release/release-build.sh | 4 ++-- dev/mima | 2 +- dev/scalastyle | 1 + dev/sparktestsupport/modules.py | 20 ++++++++++++++++++- dev/test-dependencies.sh | 2 +- docs/building-spark.md | 7 +++++++ docs/streaming-flume-integration.md | 13 +++++------- examples/pom.xml | 7 ------- .../spark/examples}/JavaFlumeEventCount.java | 2 -- .../spark/examples}/FlumeEventCount.scala | 2 -- .../examples}/FlumePollingEventCount.scala | 2 -- .../spark/streaming/flume/FlumeUtils.scala | 1 + pom.xml | 13 +++++++++--- project/SparkBuild.scala | 17 ++++++++-------- python/pyspark/streaming/flume.py | 4 ++++ python/pyspark/streaming/tests.py | 16 ++++++++++++--- 16 files changed, 73 insertions(+), 40 deletions(-) rename {examples/src/main/java/org/apache/spark/examples/streaming => external/flume/src/main/java/org/apache/spark/examples}/JavaFlumeEventCount.java (98%) rename {examples/src/main/scala/org/apache/spark/examples/streaming => external/flume/src/main/scala/org/apache/spark/examples}/FlumeEventCount.scala (98%) rename {examples/src/main/scala/org/apache/spark/examples/streaming => external/flume/src/main/scala/org/apache/spark/examples}/FlumePollingEventCount.scala (98%) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 5390f5916fc0d..7e8d5c7075195 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -84,9 +84,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/mima b/dev/mima index fdb21f5007cf2..1e3ca9700bc07 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt 
-DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index e5aa589869535..89ecc8abd6f8c 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -25,6 +25,7 @@ ERRORS=$(echo -e "q\n" \ -Pmesos \ -Pkafka-0-8 \ -Pyarn \ + -Pflume \ -Phive \ -Phive-thriftserver \ scalastyle test:scalastyle \ diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 50e14b60545af..91d5667ed1f07 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -279,6 +279,12 @@ def __hash__(self): source_file_regexes=[ "external/flume-sink", ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + "ENABLE_FLUME_TESTS": "1" + }, sbt_test_goals=[ "streaming-flume-sink/test", ] @@ -291,6 +297,12 @@ def __hash__(self): source_file_regexes=[ "external/flume", ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + "ENABLE_FLUME_TESTS": "1" + }, sbt_test_goals=[ "streaming-flume/test", ] @@ -302,7 +314,13 @@ def __hash__(self): dependencies=[streaming_flume, streaming_flume_sink], source_file_regexes=[ "external/flume-assembly", - ] + ], + build_profile_flags=[ + "-Pflume", + ], + environ={ + "ENABLE_FLUME_TESTS": "1" + } ) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index c7714578bd005..58b295d4f6e00 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 diff --git a/docs/building-spark.md b/docs/building-spark.md index 57baa503259c1..98f7df155456f 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -100,6 +100,13 @@ Note: Kafka 0.8 support is deprecated as of Spark 2.3.0. Kafka 0.10 support is still automatically built. +## Building with Flume support + +Apache Flume support must be explicitly enabled with the `flume` profile. +Note: Flume support is deprecated as of Spark 2.3.0. + + ./build/mvn -Pflume -DskipTests clean package + ## Building submodules individually It's possible to build Spark sub-modules using the `mvn -pl` option. diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index a5d36da5b6de9..257a4f7d4f3ca 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -5,6 +5,8 @@ title: Spark Streaming + Flume Integration Guide [Apache Flume](https://flume.apache.org/) is a distributed, reliable, and available service for efficiently collecting, aggregating, and moving large amounts of log data. Here we explain how to configure Flume and Spark Streaming to receive data from Flume. There are two approaches to this. +**Note: Flume support is deprecated as of Spark 2.3.0.** + ## Approach 1: Flume-style Push-based Approach Flume is designed to push data between Flume agents. In this approach, Spark Streaming essentially sets up a receiver that acts an Avro agent for Flume, to which Flume can push the data. Here are the configuration steps. @@ -44,8 +46,7 @@ configuring Flume agents. 
val flumeStream = FlumeUtils.createStream(streamingContext, [chosen machine's hostname], [chosen port]) - See the [API docs](api/scala/index.html#org.apache.spark.streaming.flume.FlumeUtils$) - and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala). + See the [API docs](api/scala/index.html#org.apache.spark.streaming.flume.FlumeUtils$).
import org.apache.spark.streaming.flume.*; @@ -53,8 +54,7 @@ configuring Flume agents. JavaReceiverInputDStream flumeStream = FlumeUtils.createStream(streamingContext, [chosen machine's hostname], [chosen port]); - See the [API docs](api/java/index.html?org/apache/spark/streaming/flume/FlumeUtils.html) - and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java). + See the [API docs](api/java/index.html?org/apache/spark/streaming/flume/FlumeUtils.html).
from pyspark.streaming.flume import FlumeUtils @@ -62,8 +62,7 @@ configuring Flume agents. flumeStream = FlumeUtils.createStream(streamingContext, [chosen machine's hostname], [chosen port]) By default, the Python API will decode Flume event body as UTF8 encoded strings. You can specify your custom decoding function to decode the body byte arrays in Flume events to any arbitrary data type. - See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.flume.FlumeUtils) - and the [example]({{site.SPARK_GITHUB_URL}}/blob/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/python/streaming/flume_wordcount.py). + See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.flume.FlumeUtils).
@@ -162,8 +161,6 @@ configuring Flume agents. - See the Scala example [FlumePollingEventCount]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala). - Note that each input DStream can be configured to receive data from multiple sinks. 3. **Deploying:** This is same as the first approach. diff --git a/examples/pom.xml b/examples/pom.xml index 52a6764ae26a5..1791dbaad775e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -34,7 +34,6 @@ examples none package - provided provided provided provided @@ -78,12 +77,6 @@ ${project.version} provided
- - org.apache.spark - spark-streaming-flume_${scala.binary.version} - ${project.version} - provided - org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java b/external/flume/src/main/java/org/apache/spark/examples/JavaFlumeEventCount.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java rename to external/flume/src/main/java/org/apache/spark/examples/JavaFlumeEventCount.java index 0c651049d0ffa..4e3420d9c3b06 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaFlumeEventCount.java +++ b/external/flume/src/main/java/org/apache/spark/examples/JavaFlumeEventCount.java @@ -48,8 +48,6 @@ public static void main(String[] args) throws Exception { System.exit(1); } - StreamingExamples.setStreamingLogLevels(); - String host = args[0]; int port = Integer.parseInt(args[1]); diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala b/external/flume/src/main/scala/org/apache/spark/examples/FlumeEventCount.scala similarity index 98% rename from examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala rename to external/flume/src/main/scala/org/apache/spark/examples/FlumeEventCount.scala index 91e52e4eff5a7..f877f79391b37 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala +++ b/external/flume/src/main/scala/org/apache/spark/examples/FlumeEventCount.scala @@ -47,8 +47,6 @@ object FlumeEventCount { System.exit(1) } - StreamingExamples.setStreamingLogLevels() - val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala b/external/flume/src/main/scala/org/apache/spark/examples/FlumePollingEventCount.scala similarity index 98% rename from examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala rename to external/flume/src/main/scala/org/apache/spark/examples/FlumePollingEventCount.scala index dd725d72c23ef..79a4027ca5bde 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala +++ b/external/flume/src/main/scala/org/apache/spark/examples/FlumePollingEventCount.scala @@ -44,8 +44,6 @@ object FlumePollingEventCount { System.exit(1) } - StreamingExamples.setStreamingLogLevels() - val Array(host, IntParam(port)) = args val batchInterval = Milliseconds(2000) diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 3e3ed712f0dbf..707193a957700 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -30,6 +30,7 @@ import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaReceiverInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.ReceiverInputDStream +@deprecated("Deprecated without replacement", "2.3.0") object FlumeUtils { private val DEFAULT_POLLING_PARALLELISM = 5 private val DEFAULT_POLLING_BATCH_SIZE = 1000 diff --git a/pom.xml b/pom.xml index 87a468c3a6f55..9fac8b1e53788 100644 --- a/pom.xml +++ b/pom.xml @@ -98,15 +98,13 @@ sql/core sql/hive assembly - external/flume - 
external/flume-sink - external/flume-assembly examples repl launcher external/kafka-0-10 external/kafka-0-10-assembly external/kafka-0-10-sql + @@ -2583,6 +2581,15 @@ + + flume + + external/flume + external/flume-sink + external/flume-assembly + + + spark-ganglia-lgpl diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index a568d264cb2db..9501eed1e906b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -43,11 +43,8 @@ object BuildCommons { "catalyst", "sql", "hive", "hive-thriftserver", "sql-kafka-0-10" ).map(ProjectRef(buildLocation, _)) - val streamingProjects@Seq( - streaming, streamingFlumeSink, streamingFlume, streamingKafka010 - ) = Seq( - "streaming", "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-10" - ).map(ProjectRef(buildLocation, _)) + val streamingProjects@Seq(streaming, streamingKafka010) = + Seq("streaming", "streaming-kafka-0-10").map(ProjectRef(buildLocation, _)) val allProjects@Seq( core, graphx, mllib, mllibLocal, repl, networkCommon, networkShuffle, launcher, unsafe, tags, sketch, kvstore, _* @@ -56,9 +53,13 @@ object BuildCommons { "tags", "sketch", "kvstore" ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects - val optionallyEnabledProjects@Seq(mesos, yarn, streamingKafka, sparkGangliaLgpl, - streamingKinesisAsl, dockerIntegrationTests, hadoopCloud) = - Seq("mesos", "yarn", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", + val optionallyEnabledProjects@Seq(mesos, yarn, + streamingFlumeSink, streamingFlume, + streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, + dockerIntegrationTests, hadoopCloud) = + Seq("mesos", "yarn", + "streaming-flume-sink", "streaming-flume", + "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKafka010Assembly, streamingKinesisAslAssembly) = diff --git a/python/pyspark/streaming/flume.py b/python/pyspark/streaming/flume.py index cd30483fc636a..2fed5940b31ea 100644 --- a/python/pyspark/streaming/flume.py +++ b/python/pyspark/streaming/flume.py @@ -53,6 +53,8 @@ def createStream(ssc, hostname, port, :param enableDecompression: Should netty server decompress input stream :param bodyDecoder: A function used to decode body (default is utf8_decoder) :return: A DStream object + + .. note:: Deprecated in 2.3.0 """ jlevel = ssc._sc._getJavaStorageLevel(storageLevel) helper = FlumeUtils._get_helper(ssc._sc) @@ -79,6 +81,8 @@ def createPollingStream(ssc, addresses, will result in this stream using more threads :param bodyDecoder: A function used to decode body (default is utf8_decoder) :return: A DStream object + + .. note:: Deprecated in 2.3.0 """ jlevel = ssc._sc._getJavaStorageLevel(storageLevel) hosts = [] diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 229cf53e47359..5b86c1cb2c390 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -1478,7 +1478,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. 
" % kafka_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/package streaming-kafka-0-8-assembly/assembly' or " - "'build/mvn package' before running this test.") + "'build/mvn -Pkafka-0-8 package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1495,7 +1495,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with " "'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " - "'build/mvn package' before running this test.") + "'build/mvn -Pflume package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " "remove all but one") % (", ".join(jars))) @@ -1516,6 +1516,9 @@ def search_kinesis_asl_assembly_jar(): return jars[0] +# Must be same as the variable and condition defined in modules.py +flume_test_environ_var = "ENABLE_FLUME_TESTS" +are_flume_tests_enabled = os.environ.get(flume_test_environ_var) == '1' # Must be same as the variable and condition defined in modules.py kafka_test_environ_var = "ENABLE_KAFKA_0_8_TESTS" are_kafka_tests_enabled = os.environ.get(kafka_test_environ_var) == '1' @@ -1538,9 +1541,16 @@ def search_kinesis_asl_assembly_jar(): os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, - FlumeStreamTests, FlumePollingStreamTests, StreamingListenerTests] + if are_flume_tests_enabled: + testcases.append(FlumeStreamTests) + testcases.append(FlumePollingStreamTests) + else: + sys.stderr.write( + "Skipped test_flume_stream (enable by setting environment variable %s=1" + % flume_test_environ_var) + if are_kafka_tests_enabled: testcases.append(KafkaStreamTests) else: From c7b46d4d8aa8da24131d79d2bfa36e8db19662e4 Mon Sep 17 00:00:00 2001 From: minixalpha Date: Fri, 6 Oct 2017 23:38:47 +0900 Subject: [PATCH 671/779] [SPARK-21877][DEPLOY, WINDOWS] Handle quotes in Windows command scripts ## What changes were proposed in this pull request? All the windows command scripts can not handle quotes in parameter. Run a windows command shell with parameter which has quotes can reproduce the bug: ``` C:\Users\meng\software\spark-2.2.0-bin-hadoop2.7> bin\spark-shell --driver-java-options " -Dfile.encoding=utf-8 " 'C:\Users\meng\software\spark-2.2.0-bin-hadoop2.7\bin\spark-shell2.cmd" --driver-java-options "' is not recognized as an internal or external command, operable program or batch file. ``` Windows recognize "--driver-java-options" as part of the command. All the Windows command script has the following code have the bug. ``` cmd /V /E /C "" %* ``` We should quote command and parameters like ``` cmd /V /E /C """ %*" ``` ## How was this patch tested? Test manually on Windows 10 and Windows 7 We can verify it by the following demo: ``` C:\Users\meng\program\demo>cat a.cmd echo off cmd /V /E /C "b.cmd" %* C:\Users\meng\program\demo>cat b.cmd echo off echo %* C:\Users\meng\program\demo>cat c.cmd echo off cmd /V /E /C ""b.cmd" %*" C:\Users\meng\program\demo>a.cmd "123" 'b.cmd" "123' is not recognized as an internal or external command, operable program or batch file. C:\Users\meng\program\demo>c.cmd "123" "123" ``` With the spark-shell.cmd example, change it to the following code will make the command execute succeed. 
``` cmd /V /E /C ""%~dp0spark-shell2.cmd" %*" ``` ``` C:\Users\meng\software\spark-2.2.0-bin-hadoop2.7> bin\spark-shell --driver-java-options " -Dfile.encoding=utf-8 " Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). ... ``` Author: minixalpha Closes #19090 from minixalpha/master. --- bin/beeline.cmd | 4 +++- bin/pyspark.cmd | 4 +++- bin/run-example.cmd | 5 ++++- bin/spark-class.cmd | 4 +++- bin/spark-shell.cmd | 4 +++- bin/spark-submit.cmd | 4 +++- bin/sparkR.cmd | 4 +++- 7 files changed, 22 insertions(+), 7 deletions(-) diff --git a/bin/beeline.cmd b/bin/beeline.cmd index 02464bd088792..288059a28cd74 100644 --- a/bin/beeline.cmd +++ b/bin/beeline.cmd @@ -17,4 +17,6 @@ rem See the License for the specific language governing permissions and rem limitations under the License. rem -cmd /V /E /C "%~dp0spark-class.cmd" org.apache.hive.beeline.BeeLine %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0spark-class.cmd" org.apache.hive.beeline.BeeLine %*" diff --git a/bin/pyspark.cmd b/bin/pyspark.cmd index 72d046a4ba2cf..3dcf1d45a8189 100644 --- a/bin/pyspark.cmd +++ b/bin/pyspark.cmd @@ -20,4 +20,6 @@ rem rem This is the entry point for running PySpark. To avoid polluting the rem environment, it just launches a new cmd to do the real work. -cmd /V /E /C "%~dp0pyspark2.cmd" %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0pyspark2.cmd" %*" diff --git a/bin/run-example.cmd b/bin/run-example.cmd index f9b786e92b823..efa5f81d08f7f 100644 --- a/bin/run-example.cmd +++ b/bin/run-example.cmd @@ -19,4 +19,7 @@ rem set SPARK_HOME=%~dp0.. set _SPARK_CMD_USAGE=Usage: ./bin/run-example [options] example-class [example args] -cmd /V /E /C "%~dp0spark-submit.cmd" run-example %* + +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0spark-submit.cmd" run-example %*" diff --git a/bin/spark-class.cmd b/bin/spark-class.cmd index 3bf3d20cb57b5..b22536ab6f458 100644 --- a/bin/spark-class.cmd +++ b/bin/spark-class.cmd @@ -20,4 +20,6 @@ rem rem This is the entry point for running a Spark class. To avoid polluting rem the environment, it just launches a new cmd to do the real work. -cmd /V /E /C "%~dp0spark-class2.cmd" %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0spark-class2.cmd" %*" diff --git a/bin/spark-shell.cmd b/bin/spark-shell.cmd index 991423da6ab99..e734f13097d61 100644 --- a/bin/spark-shell.cmd +++ b/bin/spark-shell.cmd @@ -20,4 +20,6 @@ rem rem This is the entry point for running Spark shell. To avoid polluting the rem environment, it just launches a new cmd to do the real work. -cmd /V /E /C "%~dp0spark-shell2.cmd" %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. 
+cmd /V /E /C ""%~dp0spark-shell2.cmd" %*" diff --git a/bin/spark-submit.cmd b/bin/spark-submit.cmd index f301606933a95..da62a8777524d 100644 --- a/bin/spark-submit.cmd +++ b/bin/spark-submit.cmd @@ -20,4 +20,6 @@ rem rem This is the entry point for running Spark submit. To avoid polluting the rem environment, it just launches a new cmd to do the real work. -cmd /V /E /C "%~dp0spark-submit2.cmd" %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0spark-submit2.cmd" %*" diff --git a/bin/sparkR.cmd b/bin/sparkR.cmd index 1e5ea6a623219..fcd172b083e1e 100644 --- a/bin/sparkR.cmd +++ b/bin/sparkR.cmd @@ -20,4 +20,6 @@ rem rem This is the entry point for running SparkR. To avoid polluting the rem environment, it just launches a new cmd to do the real work. -cmd /V /E /C "%~dp0sparkR2.cmd" %* +rem The outermost quotes are used to prevent Windows command line parse error +rem when there are some quotes in parameters, see SPARK-21877. +cmd /V /E /C ""%~dp0sparkR2.cmd" %*" From 08b204fd2c731e87d3bc2cc0bccb6339ef7e3a6e Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Fri, 6 Oct 2017 12:53:35 -0700 Subject: [PATCH 672/779] [SPARK-22214][SQL] Refactor the list hive partitions code ## What changes were proposed in this pull request? In this PR we make a few changes to the list hive partitions code, to make the code more extensible. The following changes are made: 1. In `HiveClientImpl.getPartitions()`, call `client.getPartitions` instead of `shim.getAllPartitions` when `spec` is empty; 2. In `HiveTableScanExec`, previously we always call `listPartitionsByFilter` if the config `metastorePartitionPruning` is enabled, but actually, we'd better call `listPartitions` if `partitionPruningPred` is empty; 3. We should use sessionCatalog instead of SharedState.externalCatalog in `HiveTableScanExec`. ## How was this patch tested? Tested by existing test cases since this is code refactor, no regression or behavior change is expected. Author: Xingbo Jiang Closes #19444 from jiangxb1987/hivePartitions. --- .../sql/catalyst/catalog/interface.scala | 5 ++++ .../sql/hive/client/HiveClientImpl.scala | 7 +++-- .../hive/execution/HiveTableScanExec.scala | 28 +++++++++---------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index fe2af910a0ae5..975b084aa6188 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -405,6 +405,11 @@ object CatalogTypes { * Specifications of a table partition. Mapping column name to column value. */ type TablePartitionSpec = Map[String, String] + + /** + * Initialize an empty spec. 
+ */ + lazy val emptyTablePartitionSpec: TablePartitionSpec = Map.empty[String, String] } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 66165c7228bca..a01c312d5e497 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -638,12 +638,13 @@ private[hive] class HiveClientImpl( table: CatalogTable, spec: Option[TablePartitionSpec]): Seq[CatalogTablePartition] = withHiveState { val hiveTable = toHiveTable(table, Some(userName)) - val parts = spec match { - case None => shim.getAllPartitions(client, hiveTable).map(fromHivePartition) + val partSpec = spec match { + case None => CatalogTypes.emptyTablePartitionSpec case Some(s) => assert(s.values.forall(_.nonEmpty), s"partition spec '$s' is invalid") - client.getPartitions(hiveTable, s.asJava).asScala.map(fromHivePartition) + s } + val parts = client.getPartitions(hiveTable, partSpec.asJava).asScala.map(fromHivePartition) HiveCatalogMetrics.incrementFetchedPartitions(parts.length) parts } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 48d0b4a63e54a..4f8dab9cd6172 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -162,21 +162,19 @@ case class HiveTableScanExec( // exposed for tests @transient lazy val rawPartitions = { - val prunedPartitions = if (sparkSession.sessionState.conf.metastorePartitionPruning) { - // Retrieve the original attributes based on expression ID so that capitalization matches. - val normalizedFilters = partitionPruningPred.map(_.transform { - case a: AttributeReference => originalAttributes(a) - }) - sparkSession.sharedState.externalCatalog.listPartitionsByFilter( - relation.tableMeta.database, - relation.tableMeta.identifier.table, - normalizedFilters, - sparkSession.sessionState.conf.sessionLocalTimeZone) - } else { - sparkSession.sharedState.externalCatalog.listPartitions( - relation.tableMeta.database, - relation.tableMeta.identifier.table) - } + val prunedPartitions = + if (sparkSession.sessionState.conf.metastorePartitionPruning && + partitionPruningPred.size > 0) { + // Retrieve the original attributes based on expression ID so that capitalization matches. + val normalizedFilters = partitionPruningPred.map(_.transform { + case a: AttributeReference => originalAttributes(a) + }) + sparkSession.sessionState.catalog.listPartitionsByFilter( + relation.tableMeta.identifier, + normalizedFilters) + } else { + sparkSession.sessionState.catalog.listPartitions(relation.tableMeta.identifier) + } prunedPartitions.map(HiveClientImpl.toHivePartition(_, hiveQlTable)) } From debcbec7491d3a23b19ef149e50d2887590b6de0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 Oct 2017 13:10:04 -0700 Subject: [PATCH 673/779] [SPARK-21947][SS] Check and report error when monotonically_increasing_id is used in streaming query ## What changes were proposed in this pull request? `monotonically_increasing_id` doesn't work in Structured Streaming. We should throw an exception if a streaming query uses it. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #19336 from viirya/SPARK-21947. 
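For reference, a minimal sketch (not part of this patch) of the kind of query the new check rejects; the session setup and the `rate` source below are illustrative assumptions rather than code from this change:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.monotonically_increasing_id

val spark = SparkSession.builder().master("local[2]").appName("mid-check-sketch").getOrCreate()

// A streaming DataFrame whose projection uses monotonically_increasing_id.
val stream = spark.readStream.format("rate").load()
  .select(monotonically_increasing_id().as("id"))

// The check runs when the query is started, so start() should fail with an
// AnalysisException mentioning "monotonically_increasing_id".
stream.writeStream.format("console").start()
```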
--- .../analysis/UnsupportedOperationChecker.scala | 15 ++++++++++++++- .../analysis/UnsupportedOperationsSuite.scala | 10 +++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index dee6fbe9d1514..04502d04d9509 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, MonotonicallyIncreasingID} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans._ @@ -129,6 +129,16 @@ object UnsupportedOperationChecker { !subplan.isStreaming || (aggs.nonEmpty && outputMode == InternalOutputModes.Complete) } + def checkUnsupportedExpressions(implicit operator: LogicalPlan): Unit = { + val unsupportedExprs = operator.expressions.flatMap(_.collect { + case m: MonotonicallyIncreasingID => m + }).distinct + if (unsupportedExprs.nonEmpty) { + throwError("Expression(s): " + unsupportedExprs.map(_.sql).mkString(", ") + + " is not supported with streaming DataFrames/Datasets") + } + } + plan.foreachUp { implicit subPlan => // Operations that cannot exists anywhere in a streaming plan @@ -323,6 +333,9 @@ object UnsupportedOperationChecker { case _ => } + + // Check if there are unsupported expressions in streaming query plan. 
+ checkUnsupportedExpressions(subPlan) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index e5057c451d5b8..60d1351fda264 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, MonotonicallyIncreasingID, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{FlatMapGroupsWithState, _} @@ -614,6 +614,14 @@ class UnsupportedOperationsSuite extends SparkFunSuite { testOutputMode(Update, shouldSupportAggregation = true, shouldSupportNonAggregation = true) testOutputMode(Complete, shouldSupportAggregation = true, shouldSupportNonAggregation = false) + // Unsupported expressions in streaming plan + assertNotSupportedInStreamingPlan( + "MonotonicallyIncreasingID", + streamRelation.select(MonotonicallyIncreasingID()), + outputMode = Append, + expectedMsgs = Seq("monotonically_increasing_id")) + + /* ======================================================================================= TESTING FUNCTIONS From 2030f19511f656e9534f3fd692e622e45f9a074e Mon Sep 17 00:00:00 2001 From: Sergey Zhemzhitsky Date: Fri, 6 Oct 2017 20:43:53 -0700 Subject: [PATCH 674/779] [SPARK-21549][CORE] Respect OutputFormats with no output directory provided ## What changes were proposed in this pull request? Fix for the https://issues.apache.org/jira/browse/SPARK-21549 JIRA issue. Since version 2.2, Spark does not respect OutputFormats that provide no output path. Examples of such formats are [Cassandra OutputFormat](https://github.com/finn-no/cassandra-hadoop/blob/08dfa3a7ac727bb87269f27a1c82ece54e3f67e6/src/main/java/org/apache/cassandra/hadoop2/AbstractColumnFamilyOutputFormat.java), [Aerospike OutputFormat](https://github.com/aerospike/aerospike-hadoop/blob/master/mapreduce/src/main/java/com/aerospike/hadoop/mapreduce/AerospikeOutputFormat.java), etc., which have no ability to roll back results written to an external system on job failure. Spark requires a provided output directory so that files can be committed to an absolute output location, but that is not the case for output formats that write data to external systems. This pull request prevents accessing the `absPathStagingDir` method, which causes the error described in SPARK-21549, unless there are files to rename in `addedAbsPathFiles`. ## How was this patch tested? Unit tests. Author: Sergey Zhemzhitsky Closes #19294 from szhem/SPARK-21549-abs-output-commits.
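To illustrate the affected code path, here is a minimal sketch (not part of this patch) of a save through an OutputFormat that writes to an external system and therefore never sets an output directory. `ExternalSystemOutputFormat` is a hypothetical stand-in for formats like the Cassandra or Aerospike ones, and an active `sc` (SparkContext) is assumed:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

// ExternalSystemOutputFormat is a hypothetical OutputFormat that writes to an external
// store and so never sets mapreduce.output.fileoutputformat.outputdir.
val job = Job.getInstance(new Configuration(sc.hadoopConfiguration))
job.setOutputKeyClass(classOf[Integer])
job.setOutputValueClass(classOf[Integer])
job.setOutputFormatClass(classOf[ExternalSystemOutputFormat])

// Before this fix, committing such a job failed on the driver with
// "java.lang.IllegalArgumentException: Can not create a Path from a null string";
// with the fix, the absolute-path staging directory is skipped when no path is set.
sc.parallelize(Seq((new Integer(1), new Integer(2))), 1)
  .saveAsNewAPIHadoopDataset(job.getConfiguration)
```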
--- .../io/HadoopMapReduceCommitProtocol.scala | 28 ++++++++++++---- .../spark/rdd/PairRDDFunctionsSuite.scala | 33 ++++++++++++++++++- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index b1d07ab2c9199..a7e6859ef6b64 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -35,6 +35,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * (from the newer mapreduce API, not the old mapred API). * * Unlike Hadoop's OutputCommitter, this implementation is serializable. + * + * @param jobId the job's or stage's id + * @param path the job's output path, or null if committer acts as a noop */ class HadoopMapReduceCommitProtocol(jobId: String, path: String) extends FileCommitProtocol with Serializable with Logging { @@ -57,6 +60,15 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) */ private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + /** + * Checks whether there are files to be committed to an absolute output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether the output path is specified. Output path may not be required + * for committers not writing to distributed file systems. + */ + private def hasAbsPathFiles: Boolean = path != null + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.newInstance() // If OutputFormat is Configurable, we should set conf to it. 
@@ -130,17 +142,21 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) .foldLeft(Map[String, String]())(_ ++ _) logDebug(s"Committing files staged for absolute locations $filesToMove") - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) - for ((src, dst) <- filesToMove) { - fs.rename(new Path(src), new Path(dst)) + if (hasAbsPathFiles) { + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + for ((src, dst) <- filesToMove) { + fs.rename(new Path(src), new Path(dst)) + } + fs.delete(absPathStagingDir, true) } - fs.delete(absPathStagingDir, true) } override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) - fs.delete(absPathStagingDir, true) + if (hasAbsPathFiles) { + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(absPathStagingDir, true) + } } override def setupTask(taskContext: TaskAttemptContext): Unit = { diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 44dd955ce8690..07579c5098014 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -26,7 +26,7 @@ import org.apache.commons.math3.distribution.{BinomialDistribution, PoissonDistr import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.mapred._ -import org.apache.hadoop.mapreduce.{JobContext => NewJobContext, +import org.apache.hadoop.mapreduce.{Job => NewJob, JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} import org.apache.hadoop.util.Progressable @@ -568,6 +568,37 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assert(FakeWriterWithCallback.exception.getMessage contains "failed to write") } + test("saveAsNewAPIHadoopDataset should respect empty output directory when " + + "there are no files to be committed to an absolute output location") { + val pairs = sc.parallelize(Array((new Integer(1), new Integer(2))), 1) + + val job = NewJob.getInstance(new Configuration(sc.hadoopConfiguration)) + job.setOutputKeyClass(classOf[Integer]) + job.setOutputValueClass(classOf[Integer]) + job.setOutputFormatClass(classOf[NewFakeFormat]) + val jobConfiguration = job.getConfiguration + + // just test that the job does not fail with + // java.lang.IllegalArgumentException: Can not create a Path from a null string + pairs.saveAsNewAPIHadoopDataset(jobConfiguration) + } + + test("saveAsHadoopDataset should respect empty output directory when " + + "there are no files to be committed to an absolute output location") { + val pairs = sc.parallelize(Array((new Integer(1), new Integer(2))), 1) + + val conf = new JobConf() + conf.setOutputKeyClass(classOf[Integer]) + conf.setOutputValueClass(classOf[Integer]) + conf.setOutputFormat(classOf[FakeOutputFormat]) + conf.setOutputCommitter(classOf[FakeOutputCommitter]) + + FakeOutputCommitter.ran = false + pairs.saveAsHadoopDataset(conf) + + assert(FakeOutputCommitter.ran, "OutputCommitter was never called") + } + test("lookup") { val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7))) 
From 5eacc3bfa9b9c1435ce04222ac7f943b5f930cf4 Mon Sep 17 00:00:00 2001 From: Kento NOZAWA Date: Sat, 7 Oct 2017 08:30:48 +0100 Subject: [PATCH 675/779] [SPARK-22156][MLLIB] Fix update equation of learning rate in Word2Vec.scala ## What changes were proposed in this pull request? Current equation of learning rate is incorrect when `numIterations` > `1`. This PR is based on [original C code](https://github.com/tmikolov/word2vec/blob/master/word2vec.c#L393). cc: mengxr ## How was this patch tested? manual tests I modified [this example code](https://spark.apache.org/docs/2.1.1/mllib-feature-extraction.html#example). ### `numIteration=1` #### Code ```scala import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) val word2vec = new Word2Vec() val model = word2vec.fit(input) val synonyms = model.findSynonyms("1", 5) for((synonym, cosineSimilarity) <- synonyms) { println(s"$synonym $cosineSimilarity") } ``` #### Result ``` 2 0.175856813788414 0 0.10971353203058243 4 0.09818313270807266 3 0.012947646901011467 9 -0.09881238639354706 ``` ### `numIteration=5` #### Code ```scala import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel} val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq) val word2vec = new Word2Vec() word2vec.setNumIterations(5) val model = word2vec.fit(input) val synonyms = model.findSynonyms("1", 5) for((synonym, cosineSimilarity) <- synonyms) { println(s"$synonym $cosineSimilarity") } ``` #### Result ``` 0 0.9898583889007568 2 0.9808019399642944 4 0.9794934391975403 3 0.9506527781486511 9 -0.9065656661987305 ``` Author: Kento NOZAWA Closes #19372 from nzw0301/master. --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 6f96813497b62..b8c306d86bace 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -353,11 +353,14 @@ class Word2Vec extends Serializable with Logging { val syn0Global = Array.fill[Float](vocabSize * vectorSize)((initRandom.nextFloat() - 0.5f) / vectorSize) val syn1Global = new Array[Float](vocabSize * vectorSize) + val totalWordsCounts = numIterations * trainWordsCount + 1 var alpha = learningRate for (k <- 1 to numIterations) { val bcSyn0Global = sc.broadcast(syn0Global) val bcSyn1Global = sc.broadcast(syn1Global) + val numWordsProcessedInPreviousIterations = (k - 1) * trainWordsCount + val partial = newSentences.mapPartitionsWithIndex { case (idx, iter) => val random = new XORShiftRandom(seed ^ ((idx + 1) << 16) ^ ((-k - 1) << 8)) val syn0Modify = new Array[Int](vocabSize) @@ -368,11 +371,12 @@ class Word2Vec extends Serializable with Logging { var wc = wordCount if (wordCount - lastWordCount > 10000) { lwc = wordCount - // TODO: discount by iteration? 
- alpha = - learningRate * (1 - numPartitions * wordCount.toDouble / (trainWordsCount + 1)) + alpha = learningRate * + (1 - (numPartitions * wordCount.toDouble + numWordsProcessedInPreviousIterations) / + totalWordsCounts) if (alpha < learningRate * 0.0001) alpha = learningRate * 0.0001 - logInfo("wordCount = " + wordCount + ", alpha = " + alpha) + logInfo(s"wordCount = ${wordCount + numWordsProcessedInPreviousIterations}, " + + s"alpha = $alpha") } wc += sentence.length var pos = 0 From c998a2ae0ea019dfb9b39cef6ddfac07c496e083 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Sun, 8 Oct 2017 12:58:39 +0100 Subject: [PATCH 676/779] [SPARK-22147][CORE] Removed redundant allocations from BlockId ## What changes were proposed in this pull request? Prior to this commit, BlockId.hashCode and BlockId.equals were defined in terms of BlockId.name. This allowed the subclasses to be concise and enforced BlockId.name as a single unique identifier for a block. All subclasses override BlockId.name with an expression involving an allocation of StringBuilder and ultimately String. This is suboptimal since it induces unnecessary GC pressure on the driver; see BlockManagerMasterEndpoint. The commit removes the definition of hashCode and equals from the base class. No other change is necessary since all subclasses are in fact case classes and therefore have auto-generated hashCode and equals. No change of behaviour is expected. Side note: you might be wondering why the subclasses used the base implementation and not the auto-generated one. Apparently, this behaviour is documented in the spec; see this SO answer for details: https://stackoverflow.com/a/44990210/262432. ## How was this patch tested? BlockIdSuite Author: Sergei Lebedev Closes #19369 from superbobry/blockid-equals-hashcode. --- .../netty/NettyBlockTransferService.scala | 2 +- .../org/apache/spark/storage/BlockId.scala | 5 -- .../org/apache/spark/storage/DiskStore.scala | 8 +-- .../BlockManagerReplicationSuite.scala | 49 ------------------- 4 files changed, 5 insertions(+), 59 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index ac4d85004bad1..6a29e18bf3cbb 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -151,7 +151,7 @@ private[spark] class NettyBlockTransferService( // Convert or copy nio buffer into array in order to serialize it.
val array = JavaUtils.bufferToArray(blockData.nioByteBuffer()) - client.sendRpc(new UploadBlock(appId, execId, blockId.toString, metadata, array).toByteBuffer, + client.sendRpc(new UploadBlock(appId, execId, blockId.name, metadata, array).toByteBuffer, new RpcResponseCallback { override def onSuccess(response: ByteBuffer): Unit = { logTrace(s"Successfully uploaded block $blockId") diff --git a/core/src/main/scala/org/apache/spark/storage/BlockId.scala b/core/src/main/scala/org/apache/spark/storage/BlockId.scala index 524f6970992a5..a441baed2800e 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockId.scala @@ -41,11 +41,6 @@ sealed abstract class BlockId { def isBroadcast: Boolean = isInstanceOf[BroadcastBlockId] override def toString: String = name - override def hashCode: Int = name.hashCode - override def equals(other: Any): Boolean = other match { - case o: BlockId => getClass == o.getClass && name.equals(o.name) - case _ => false - } } @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index 3579acf8d83d9..97abd92d4b70f 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -47,9 +47,9 @@ private[spark] class DiskStore( private val minMemoryMapBytes = conf.getSizeAsBytes("spark.storage.memoryMapThreshold", "2m") private val maxMemoryMapBytes = conf.getSizeAsBytes("spark.storage.memoryMapLimitForTests", Int.MaxValue.toString) - private val blockSizes = new ConcurrentHashMap[String, Long]() + private val blockSizes = new ConcurrentHashMap[BlockId, Long]() - def getSize(blockId: BlockId): Long = blockSizes.get(blockId.name) + def getSize(blockId: BlockId): Long = blockSizes.get(blockId) /** * Invokes the provided callback function to write the specific block. 
@@ -67,7 +67,7 @@ private[spark] class DiskStore( var threwException: Boolean = true try { writeFunc(out) - blockSizes.put(blockId.name, out.getCount) + blockSizes.put(blockId, out.getCount) threwException = false } finally { try { @@ -113,7 +113,7 @@ private[spark] class DiskStore( } def remove(blockId: BlockId): Boolean = { - blockSizes.remove(blockId.name) + blockSizes.remove(blockId) val file = diskManager.getFile(blockId.name) if (file.exists()) { val ret = file.delete() diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala index dd61dcd11bcda..c2101ba828553 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala @@ -198,55 +198,6 @@ trait BlockManagerReplicationBehavior extends SparkFunSuite } } - test("block replication - deterministic node selection") { - val blockSize = 1000 - val storeSize = 10000 - val stores = (1 to 5).map { - i => makeBlockManager(storeSize, s"store$i") - } - val storageLevel2x = StorageLevel.MEMORY_AND_DISK_2 - val storageLevel3x = StorageLevel(true, true, false, true, 3) - val storageLevel4x = StorageLevel(true, true, false, true, 4) - - def putBlockAndGetLocations(blockId: String, level: StorageLevel): Set[BlockManagerId] = { - stores.head.putSingle(blockId, new Array[Byte](blockSize), level) - val locations = master.getLocations(blockId).sortBy { _.executorId }.toSet - stores.foreach { _.removeBlock(blockId) } - master.removeBlock(blockId) - locations - } - - // Test if two attempts to 2x replication returns same set of locations - val a1Locs = putBlockAndGetLocations("a1", storageLevel2x) - assert(putBlockAndGetLocations("a1", storageLevel2x) === a1Locs, - "Inserting a 2x replicated block second time gave different locations from the first") - - // Test if two attempts to 3x replication returns same set of locations - val a2Locs3x = putBlockAndGetLocations("a2", storageLevel3x) - assert(putBlockAndGetLocations("a2", storageLevel3x) === a2Locs3x, - "Inserting a 3x replicated block second time gave different locations from the first") - - // Test if 2x replication of a2 returns a strict subset of the locations of 3x replication - val a2Locs2x = putBlockAndGetLocations("a2", storageLevel2x) - assert( - a2Locs2x.subsetOf(a2Locs3x), - "Inserting a with 2x replication gave locations that are not a subset of locations" + - s" with 3x replication [3x: ${a2Locs3x.mkString(",")}; 2x: ${a2Locs2x.mkString(",")}" - ) - - // Test if 4x replication of a2 returns a strict superset of the locations of 3x replication - val a2Locs4x = putBlockAndGetLocations("a2", storageLevel4x) - assert( - a2Locs3x.subsetOf(a2Locs4x), - "Inserting a with 4x replication gave locations that are not a superset of locations " + - s"with 3x replication [3x: ${a2Locs3x.mkString(",")}; 4x: ${a2Locs4x.mkString(",")}" - ) - - // Test if 3x replication of two different blocks gives two different sets of locations - val a3Locs3x = putBlockAndGetLocations("a3", storageLevel3x) - assert(a3Locs3x !== a2Locs3x, "Two blocks gave same locations with 3x replication") - } - test("block replication - replication failures") { /* Create a system of three block managers / stores. 
One of them (say, failableStore) From fe7b219ae3e8a045655a836cbb77219036ec5740 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Mon, 9 Oct 2017 14:16:25 +0800 Subject: [PATCH 677/779] [SPARK-22074][CORE] Task killed by other attempt task should not be resubmitted ## What changes were proposed in this pull request? As described in detail in [SPARK-22074](https://issues.apache.org/jira/browse/SPARK-22074), an unnecessary resubmit may cause a stage to hang in current release versions. This patch adds a new flag in `TaskSetManager` to mark whether a task was killed by another attempt. ## How was this patch tested? Added a new UT `[SPARK-22074] Task killed by other attempt task should not be resubmitted` in TaskSetManagerSuite. The UT recreates the scenario from the JIRA description; it failed without the changes in this PR and passes with them. Author: Yuanjian Li Closes #19287 from xuanyuanking/SPARK-22074. --- .../spark/scheduler/TaskSetManager.scala | 8 +- .../org/apache/spark/scheduler/FakeTask.scala | 20 +++- .../spark/scheduler/TaskSetManagerSuite.scala | 107 ++++++++++++++++++ 3 files changed, 132 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 3bdede6743d1b..de4711f461df2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -83,6 +83,11 @@ private[spark] class TaskSetManager( val successful = new Array[Boolean](numTasks) private val numFailures = new Array[Int](numTasks) + // Set the coresponding index of Boolean var when the task killed by other attempt tasks, + // this happened while we set the `spark.speculation` to true. The task killed by others + // should not resubmit while executor lost.
+ private val killedByOtherAttempt: Array[Boolean] = new Array[Boolean](numTasks) + val taskAttempts = Array.fill[List[TaskInfo]](numTasks)(Nil) private[scheduler] var tasksSuccessful = 0 @@ -729,6 +734,7 @@ private[spark] class TaskSetManager( logInfo(s"Killing attempt ${attemptInfo.attemptNumber} for task ${attemptInfo.id} " + s"in stage ${taskSet.id} (TID ${attemptInfo.taskId}) on ${attemptInfo.host} " + s"as the attempt ${info.attemptNumber} succeeded on ${info.host}") + killedByOtherAttempt(index) = true sched.backend.killTask( attemptInfo.taskId, attemptInfo.executorId, @@ -915,7 +921,7 @@ private[spark] class TaskSetManager( && !isZombie) { for ((tid, info) <- taskInfos if info.executorId == execId) { val index = taskInfos(tid).index - if (successful(index)) { + if (successful(index) && !killedByOtherAttempt(index)) { successful(index) = false copiesRunning(index) -= 1 tasksSuccessful -= 1 diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala index fe6de2bd98850..109d4a0a870b8 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala @@ -19,8 +19,7 @@ package org.apache.spark.scheduler import java.util.Properties -import org.apache.spark.SparkEnv -import org.apache.spark.TaskContext +import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.executor.TaskMetrics class FakeTask( @@ -58,4 +57,21 @@ object FakeTask { } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } + + def createShuffleMapTaskSet( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + prefLocs: Seq[TaskLocation]*): TaskSet = { + if (prefLocs.size != 0 && prefLocs.size != numTasks) { + throw new IllegalArgumentException("Wrong number of task locations") + } + val tasks = Array.tabulate[Task[_]](numTasks) { i => + new ShuffleMapTask(stageId, stageAttemptId, null, new Partition { + override def index: Int = i + }, prefLocs(i), new Properties, + SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) + } + new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 5c712bd6a545b..2ce81ae27daf6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -744,6 +744,113 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(resubmittedTasks === 0) } + + test("[SPARK-22074] Task killed by other attempt task should not be resubmitted") { + val conf = new SparkConf().set("spark.speculation", "true") + sc = new SparkContext("local", "test", conf) + // Set the speculation multiplier to be 0 so speculative tasks are launched immediately + sc.conf.set("spark.speculation.multiplier", "0.0") + sc.conf.set("spark.speculation.quantile", "0.5") + sc.conf.set("spark.speculation", "true") + + var killTaskCalled = false + val sched = new FakeTaskScheduler(sc, ("exec1", "host1"), + ("exec2", "host2"), ("exec3", "host3")) + sched.initialize(new FakeSchedulerBackend() { + override def killTask( + taskId: Long, + executorId: String, + interruptThread: Boolean, + reason: String): Unit = { + // Check the only one killTask event in this case, which triggered by + // task 2.1 completed. 
+ assert(taskId === 2) + assert(executorId === "exec3") + assert(interruptThread) + assert(reason === "another attempt succeeded") + killTaskCalled = true + } + }) + + // Keep track of the number of tasks that are resubmitted, + // so that the test can check that no tasks were resubmitted. + var resubmittedTasks = 0 + val dagScheduler = new FakeDAGScheduler(sc, sched) { + override def taskEnded( + task: Task[_], + reason: TaskEndReason, + result: Any, + accumUpdates: Seq[AccumulatorV2[_, _]], + taskInfo: TaskInfo): Unit = { + super.taskEnded(task, reason, result, accumUpdates, taskInfo) + reason match { + case Resubmitted => resubmittedTasks += 1 + case _ => + } + } + } + sched.setDAGScheduler(dagScheduler) + + val taskSet = FakeTask.createShuffleMapTaskSet(4, 0, 0, + Seq(TaskLocation("host1", "exec1")), + Seq(TaskLocation("host1", "exec1")), + Seq(TaskLocation("host3", "exec3")), + Seq(TaskLocation("host2", "exec2"))) + + val clock = new ManualClock() + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock = clock) + val accumUpdatesByTask: Array[Seq[AccumulatorV2[_, _]]] = taskSet.tasks.map { task => + task.metrics.internalAccums + } + // Offer resources for 4 tasks to start + for ((exec, host) <- Seq( + "exec1" -> "host1", + "exec1" -> "host1", + "exec3" -> "host3", + "exec2" -> "host2")) { + val taskOption = manager.resourceOffer(exec, host, NO_PREF) + assert(taskOption.isDefined) + val task = taskOption.get + assert(task.executorId === exec) + // Add an extra assert to make sure task 2.0 is running on exec3 + if (task.index == 2) { + assert(task.attemptNumber === 0) + assert(task.executorId === "exec3") + } + } + assert(sched.startedTasks.toSet === Set(0, 1, 2, 3)) + clock.advance(1) + // Complete the 2 tasks and leave 2 task in running + for (id <- Set(0, 1)) { + manager.handleSuccessfulTask(id, createTaskResult(id, accumUpdatesByTask(id))) + assert(sched.endedTasks(id) === Success) + } + + // checkSpeculatableTasks checks that the task runtime is greater than the threshold for + // speculating. Since we use a threshold of 0 for speculation, tasks need to be running for + // > 0ms, so advance the clock by 1ms here. + clock.advance(1) + assert(manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.toSet === Set(2, 3)) + + // Offer resource to start the speculative attempt for the running task 2.0 + val taskOption = manager.resourceOffer("exec2", "host2", ANY) + assert(taskOption.isDefined) + val task4 = taskOption.get + assert(task4.index === 2) + assert(task4.taskId === 4) + assert(task4.executorId === "exec2") + assert(task4.attemptNumber === 1) + // Complete the speculative attempt for the running task + manager.handleSuccessfulTask(4, createTaskResult(2, accumUpdatesByTask(2))) + // Make sure schedBackend.killTask(2, "exec3", true, "another attempt succeeded") gets called + assert(killTaskCalled) + // Host 3 Losts, there's only task 2.0 on it, which killed by task 2.1 + manager.executorLost("exec3", "host3", SlaveLost()) + // Check the resubmittedTasks + assert(resubmittedTasks === 0) + } + test("speculative and noPref task should be scheduled after node-local") { sc = new SparkContext("local", "test") sched = new FakeTaskScheduler( From 98057583dd2787c0e396c2658c7dd76412f86936 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Mon, 9 Oct 2017 10:42:33 +0200 Subject: [PATCH 678/779] [SPARK-20679][ML] Support recommending for a subset of users/items in ALSModel This PR adds methods `recommendForUserSubset` and `recommendForItemSubset` to `ALSModel`. 
These allow recommending for a specified set of user / item ids rather than for every user / item (as in the `recommendForAllX` methods). The subset methods take a `DataFrame` as input, containing ids in the column specified by the param `userCol` or `itemCol`. The model will generate recommendations for each _unique_ id in this input dataframe. ## How was this patch tested? New unit tests in `ALSSuite` and Python doctests in `ALS`. Ran updated examples locally. Author: Nick Pentreath Closes #18748 from MLnick/als-recommend-df. --- .../spark/examples/ml/JavaALSExample.java | 9 ++ examples/src/main/python/ml/als_example.py | 9 ++ .../apache/spark/examples/ml/ALSExample.scala | 9 ++ .../apache/spark/ml/recommendation/ALS.scala | 48 +++++++++ .../spark/ml/recommendation/ALSSuite.scala | 100 ++++++++++++++++-- python/pyspark/ml/recommendation.py | 38 +++++++ 6 files changed, 205 insertions(+), 8 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java index fe4d6bc83f04a..27052be87b82e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java @@ -118,9 +118,18 @@ public static void main(String[] args) { Dataset userRecs = model.recommendForAllUsers(10); // Generate top 10 user recommendations for each movie Dataset movieRecs = model.recommendForAllItems(10); + + // Generate top 10 movie recommendations for a specified set of users + Dataset users = ratings.select(als.getUserCol()).distinct().limit(3); + Dataset userSubsetRecs = model.recommendForUserSubset(users, 10); + // Generate top 10 user recommendations for a specified set of movies + Dataset movies = ratings.select(als.getItemCol()).distinct().limit(3); + Dataset movieSubSetRecs = model.recommendForItemSubset(movies, 10); // $example off$ userRecs.show(); movieRecs.show(); + userSubsetRecs.show(); + movieSubSetRecs.show(); spark.stop(); } diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py index 1672d552eb1d5..8b7ec9c439f9f 100644 --- a/examples/src/main/python/ml/als_example.py +++ b/examples/src/main/python/ml/als_example.py @@ -60,8 +60,17 @@ userRecs = model.recommendForAllUsers(10) # Generate top 10 user recommendations for each movie movieRecs = model.recommendForAllItems(10) + + # Generate top 10 movie recommendations for a specified set of users + users = ratings.select(als.getUserCol()).distinct().limit(3) + userSubsetRecs = model.recommendForUserSubset(users, 10) + # Generate top 10 user recommendations for a specified set of movies + movies = ratings.select(als.getItemCol()).distinct().limit(3) + movieSubSetRecs = model.recommendForItemSubset(movies, 10) # $example off$ userRecs.show() movieRecs.show() + userSubsetRecs.show() + movieSubSetRecs.show() spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala index 07b15dfa178f7..8091838a2301e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -80,9 +80,18 @@ object ALSExample { val userRecs = model.recommendForAllUsers(10) // Generate top 10 user recommendations for each movie val movieRecs = model.recommendForAllItems(10) + + // Generate top 10 movie recommendations for a specified set of users 
+ val users = ratings.select(als.getUserCol).distinct().limit(3) + val userSubsetRecs = model.recommendForUserSubset(users, 10) + // Generate top 10 user recommendations for a specified set of movies + val movies = ratings.select(als.getItemCol).distinct().limit(3) + val movieSubSetRecs = model.recommendForItemSubset(movies, 10) // $example off$ userRecs.show() movieRecs.show() + userSubsetRecs.show() + movieSubSetRecs.show() spark.stop() } diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 3d5fd1794de23..a8843661c873b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -344,6 +344,21 @@ class ALSModel private[ml] ( recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems) } + /** + * Returns top `numItems` items recommended for each user id in the input data set. Note that if + * there are duplicate ids in the input dataset, only one set of recommendations per unique id + * will be returned. + * @param dataset a Dataset containing a column of user ids. The column name must match `userCol`. + * @param numItems max number of recommendations for each user. + * @return a DataFrame of (userCol: Int, recommendations), where recommendations are + * stored as an array of (itemCol: Int, rating: Float) Rows. + */ + @Since("2.3.0") + def recommendForUserSubset(dataset: Dataset[_], numItems: Int): DataFrame = { + val srcFactorSubset = getSourceFactorSubset(dataset, userFactors, $(userCol)) + recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems) + } + /** * Returns top `numUsers` users recommended for each item, for all items. * @param numUsers max number of recommendations for each item @@ -355,6 +370,39 @@ class ALSModel private[ml] ( recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers) } + /** + * Returns top `numUsers` users recommended for each item id in the input data set. Note that if + * there are duplicate ids in the input dataset, only one set of recommendations per unique id + * will be returned. + * @param dataset a Dataset containing a column of item ids. The column name must match `itemCol`. + * @param numUsers max number of recommendations for each item. + * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are + * stored as an array of (userCol: Int, rating: Float) Rows. + */ + @Since("2.3.0") + def recommendForItemSubset(dataset: Dataset[_], numUsers: Int): DataFrame = { + val srcFactorSubset = getSourceFactorSubset(dataset, itemFactors, $(itemCol)) + recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers) + } + + /** + * Returns a subset of a factor DataFrame limited to only those unique ids contained + * in the input dataset. + * @param dataset input Dataset containing id column to user to filter factors. + * @param factors factor DataFrame to filter. + * @param column column name containing the ids in the input dataset. + * @return DataFrame containing factors only for those ids present in both the input dataset and + * the factor DataFrame. + */ + private def getSourceFactorSubset( + dataset: Dataset[_], + factors: DataFrame, + column: String): DataFrame = { + factors + .join(dataset.select(column), factors("id") === dataset(column), joinType = "left_semi") + .select(factors("id"), factors("features")) + } + /** * Makes recommendations for all users (or items). 
* diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index ac7319110159b..addcd21d50aac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -723,9 +723,9 @@ class ALSSuite val numUsers = model.userFactors.count val numItems = model.itemFactors.count val expected = Map( - 0 -> Array((3, 54f), (4, 44f), (5, 42f), (6, 28f)), - 1 -> Array((3, 39f), (5, 33f), (4, 26f), (6, 16f)), - 2 -> Array((3, 51f), (5, 45f), (4, 30f), (6, 18f)) + 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), + 1 -> Seq((3, 39f), (5, 33f), (4, 26f), (6, 16f)), + 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) ) Seq(2, 4, 6).foreach { k => @@ -743,10 +743,10 @@ class ALSSuite val numUsers = model.userFactors.count val numItems = model.itemFactors.count val expected = Map( - 3 -> Array((0, 54f), (2, 51f), (1, 39f)), - 4 -> Array((0, 44f), (2, 30f), (1, 26f)), - 5 -> Array((2, 45f), (0, 42f), (1, 33f)), - 6 -> Array((0, 28f), (2, 18f), (1, 16f)) + 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), + 4 -> Seq((0, 44f), (2, 30f), (1, 26f)), + 5 -> Seq((2, 45f), (0, 42f), (1, 33f)), + 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) ) Seq(2, 3, 4).foreach { k => @@ -759,9 +759,93 @@ class ALSSuite } } + test("recommendForUserSubset with k <, = and > num_items") { + val spark = this.spark + import spark.implicits._ + val model = getALSModel + val numItems = model.itemFactors.count + val expected = Map( + 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), + 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) + ) + val userSubset = expected.keys.toSeq.toDF("user") + val numUsersSubset = userSubset.count + + Seq(2, 4, 6).foreach { k => + val n = math.min(k, numItems).toInt + val expectedUpToN = expected.mapValues(_.slice(0, n)) + val topItems = model.recommendForUserSubset(userSubset, k) + assert(topItems.count() == numUsersSubset) + assert(topItems.columns.contains("user")) + checkRecommendations(topItems, expectedUpToN, "item") + } + } + + test("recommendForItemSubset with k <, = and > num_users") { + val spark = this.spark + import spark.implicits._ + val model = getALSModel + val numUsers = model.userFactors.count + val expected = Map( + 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), + 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) + ) + val itemSubset = expected.keys.toSeq.toDF("item") + val numItemsSubset = itemSubset.count + + Seq(2, 3, 4).foreach { k => + val n = math.min(k, numUsers).toInt + val expectedUpToN = expected.mapValues(_.slice(0, n)) + val topUsers = model.recommendForItemSubset(itemSubset, k) + assert(topUsers.count() == numItemsSubset) + assert(topUsers.columns.contains("item")) + checkRecommendations(topUsers, expectedUpToN, "user") + } + } + + test("subset recommendations eliminate duplicate ids, returns same results as unique ids") { + val spark = this.spark + import spark.implicits._ + val model = getALSModel + val k = 2 + + val users = Seq(0, 1).toDF("user") + val dupUsers = Seq(0, 1, 0, 1).toDF("user") + val singleUserRecs = model.recommendForUserSubset(users, k) + val dupUserRecs = model.recommendForUserSubset(dupUsers, k) + .as[(Int, Seq[(Int, Float)])].collect().toMap + assert(singleUserRecs.count == dupUserRecs.size) + checkRecommendations(singleUserRecs, dupUserRecs, "item") + + val items = Seq(3, 4, 5).toDF("item") + val dupItems = Seq(3, 4, 5, 4, 5).toDF("item") + val singleItemRecs = model.recommendForItemSubset(items, 
k) + val dupItemRecs = model.recommendForItemSubset(dupItems, k) + .as[(Int, Seq[(Int, Float)])].collect().toMap + assert(singleItemRecs.count == dupItemRecs.size) + checkRecommendations(singleItemRecs, dupItemRecs, "user") + } + + test("subset recommendations on full input dataset equivalent to recommendForAll") { + val spark = this.spark + import spark.implicits._ + val model = getALSModel + val k = 2 + + val userSubset = model.userFactors.withColumnRenamed("id", "user").drop("features") + val userSubsetRecs = model.recommendForUserSubset(userSubset, k) + val allUserRecs = model.recommendForAllUsers(k).as[(Int, Seq[(Int, Float)])].collect().toMap + checkRecommendations(userSubsetRecs, allUserRecs, "item") + + val itemSubset = model.itemFactors.withColumnRenamed("id", "item").drop("features") + val itemSubsetRecs = model.recommendForItemSubset(itemSubset, k) + val allItemRecs = model.recommendForAllItems(k).as[(Int, Seq[(Int, Float)])].collect().toMap + checkRecommendations(itemSubsetRecs, allItemRecs, "user") + } + private def checkRecommendations( topK: DataFrame, - expected: Map[Int, Array[(Int, Float)]], + expected: Map[Int, Seq[(Int, Float)]], dstColName: String): Unit = { val spark = this.spark import spark.implicits._ diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index bcfb36880eb02..e8bcbe4cd34cb 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -90,6 +90,14 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha >>> item_recs.where(item_recs.item == 2)\ .select("recommendations.user", "recommendations.rating").collect() [Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])] + >>> user_subset = df.where(df.user == 2) + >>> user_subset_recs = model.recommendForUserSubset(user_subset, 3) + >>> user_subset_recs.select("recommendations.item", "recommendations.rating").first() + Row(item=[2, 1, 0], rating=[4.901..., 1.056..., -1.501...]) + >>> item_subset = df.where(df.item == 0) + >>> item_subset_recs = model.recommendForItemSubset(item_subset, 3) + >>> item_subset_recs.select("recommendations.user", "recommendations.rating").first() + Row(user=[0, 1, 2], rating=[3.910..., 2.625..., -1.501...]) >>> als_path = temp_path + "/als" >>> als.save(als_path) >>> als2 = ALS.load(als_path) @@ -414,6 +422,36 @@ def recommendForAllItems(self, numUsers): """ return self._call_java("recommendForAllItems", numUsers) + @since("2.3.0") + def recommendForUserSubset(self, dataset, numItems): + """ + Returns top `numItems` items recommended for each user id in the input data set. Note that + if there are duplicate ids in the input dataset, only one set of recommendations per unique + id will be returned. + + :param dataset: a Dataset containing a column of user ids. The column name must match + `userCol`. + :param numItems: max number of recommendations for each user + :return: a DataFrame of (userCol, recommendations), where recommendations are + stored as an array of (itemCol, rating) Rows. + """ + return self._call_java("recommendForUserSubset", dataset, numItems) + + @since("2.3.0") + def recommendForItemSubset(self, dataset, numUsers): + """ + Returns top `numUsers` users recommended for each item id in the input data set. Note that + if there are duplicate ids in the input dataset, only one set of recommendations per unique + id will be returned. + + :param dataset: a Dataset containing a column of item ids. The column name must match + `itemCol`. 
+ :param numUsers: max number of recommendations for each item + :return: a DataFrame of (itemCol, recommendations), where recommendations are + stored as an array of (userCol, rating) Rows. + """ + return self._call_java("recommendForItemSubset", dataset, numUsers) + if __name__ == "__main__": import doctest From f31e11404d6d5ee28b574c242ecbee94f35e9370 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 9 Oct 2017 12:53:10 -0700 Subject: [PATCH 679/779] [SPARK-21568][CORE] ConsoleProgressBar should only be enabled in shells ## What changes were proposed in this pull request? This PR disables console progress bar feature in non-shell environment by overriding the configuration. ## How was this patch tested? Manual. Run the following examples with and without `spark.ui.showConsoleProgress` in order to see progress bar on master branch and this PR. **Scala Shell** ```scala spark.range(1000000000).map(_ + 1).count ``` **PySpark** ```python spark.range(10000000).rdd.map(lambda x: len(x)).count() ``` **Spark Submit** ```python from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession.builder.getOrCreate() spark.range(2000000).rdd.map(lambda row: len(row)).count() spark.stop() ``` Author: Dongjoon Hyun Closes #19061 from dongjoon-hyun/SPARK-21568. --- .../main/scala/org/apache/spark/SparkContext.scala | 2 +- .../scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++++ .../org/apache/spark/internal/config/package.scala | 5 +++++ .../org/apache/spark/deploy/SparkSubmitSuite.scala | 12 ++++++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index cec61d85ccf38..b3cd03c0cfbe1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -434,7 +434,7 @@ class SparkContext(config: SparkConf) extends Logging { _statusTracker = new SparkStatusTracker(this) _progressBar = - if (_conf.getBoolean("spark.ui.showConsoleProgress", true) && !log.isInfoEnabled) { + if (_conf.get(UI_SHOW_CONSOLE_PROGRESS) && !log.isInfoEnabled) { Some(new ConsoleProgressBar(this)) } else { None diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 286a4379d2040..135bbe93bf28e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -598,6 +598,11 @@ object SparkSubmit extends CommandLineUtils with Logging { } } + // In case of shells, spark.ui.showConsoleProgress can be true by default or by user. 
+ if (isShell(args.primaryResource) && !sparkConf.contains(UI_SHOW_CONSOLE_PROGRESS)) { + sysProps(UI_SHOW_CONSOLE_PROGRESS.key) = "true" + } + // Add the application jar automatically so the user doesn't have to call sc.addJar // For YARN cluster mode, the jar is already distributed on each node as "app.jar" // For python and R files, the primary resource is already distributed as a regular file diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index d85b6a0200b8d..5278e5e0fb270 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -203,6 +203,11 @@ package object config { private[spark] val HISTORY_UI_MAX_APPS = ConfigBuilder("spark.history.ui.maxApplications").intConf.createWithDefault(Integer.MAX_VALUE) + private[spark] val UI_SHOW_CONSOLE_PROGRESS = ConfigBuilder("spark.ui.showConsoleProgress") + .doc("When true, show the progress bar in the console.") + .booleanConf + .createWithDefault(false) + private[spark] val IO_ENCRYPTION_ENABLED = ConfigBuilder("spark.io.encryption.enabled") .booleanConf .createWithDefault(false) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index ad801bf8519a6..b06f2e26a4a7a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -399,6 +399,18 @@ class SparkSubmitSuite mainClass should be ("org.apache.spark.deploy.yarn.Client") } + test("SPARK-21568 ConsoleProgressBar should be enabled only in shells") { + val clArgs1 = Seq("--class", "org.apache.spark.repl.Main", "spark-shell") + val appArgs1 = new SparkSubmitArguments(clArgs1) + val (_, _, sysProps1, _) = prepareSubmitEnvironment(appArgs1) + sysProps1(UI_SHOW_CONSOLE_PROGRESS.key) should be ("true") + + val clArgs2 = Seq("--class", "org.SomeClass", "thejar.jar") + val appArgs2 = new SparkSubmitArguments(clArgs2) + val (_, _, sysProps2, _) = prepareSubmitEnvironment(appArgs2) + sysProps2.keys should not contain UI_SHOW_CONSOLE_PROGRESS.key + } + test("launch simple application with spark-submit") { val unusedJar = TestUtils.createJarWithClasses(Seq.empty) val args = Seq( From a74ec6d7bbfe185ba995dcb02d69e90a089c293e Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Mon, 9 Oct 2017 12:56:37 -0700 Subject: [PATCH 680/779] [SPARK-22218] spark shuffle services fails to update secret on app re-attempts This patch fixes application re-attempts when running spark on yarn using the external shuffle service with security on. Currently executors will fail to launch on any application re-attempt when launched on a nodemanager that had an executor from the first attempt. The reason for this is because we aren't updating the secret key after the first application attempt. The fix here is to just remove the containskey check to see if it already exists. In this way, we always add it and make sure its the most recent secret. Similarly remove the check for containsKey on the remove since its just adding extra check that isn't really needed. Note this worked before spark 2.2 because the check used to be contains (which was looking for the value) rather then containsKey, so that never matched and it was just always adding the new secret. Patch was tested on a 10 node cluster as well as added the unit test. 
The test ran was a wordcount where the output directory already existed. With the bug present the application attempt failed with max number of executor Failures which were all saslExceptions. With the fix present the application re-attempts fail with directory already exists or when you remove the directory between attempts the re-attemps succeed. Author: Thomas Graves Closes #19450 from tgravescs/SPARK-22218. --- .../network/sasl/ShuffleSecretManager.java | 19 +++---- .../sasl/ShuffleSecretManagerSuite.java | 55 +++++++++++++++++++ 2 files changed, 62 insertions(+), 12 deletions(-) create mode 100644 common/network-shuffle/src/test/java/org/apache/spark/network/sasl/ShuffleSecretManagerSuite.java diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java b/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java index d2d008f8a3d35..7253101f41df6 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/sasl/ShuffleSecretManager.java @@ -47,12 +47,11 @@ public ShuffleSecretManager() { * fetching shuffle files written by other executors in this application. */ public void registerApp(String appId, String shuffleSecret) { - if (!shuffleSecretMap.containsKey(appId)) { - shuffleSecretMap.put(appId, shuffleSecret); - logger.info("Registered shuffle secret for application {}", appId); - } else { - logger.debug("Application {} already registered", appId); - } + // Always put the new secret information to make sure it's the most up to date. + // Otherwise we have to specifically look at the application attempt in addition + // to the applicationId since the secrets change between application attempts on yarn. + shuffleSecretMap.put(appId, shuffleSecret); + logger.info("Registered shuffle secret for application {}", appId); } /** @@ -67,12 +66,8 @@ public void registerApp(String appId, ByteBuffer shuffleSecret) { * This is called when the application terminates. */ public void unregisterApp(String appId) { - if (shuffleSecretMap.containsKey(appId)) { - shuffleSecretMap.remove(appId); - logger.info("Unregistered shuffle secret for application {}", appId); - } else { - logger.warn("Attempted to unregister application {} when it is not registered", appId); - } + shuffleSecretMap.remove(appId); + logger.info("Unregistered shuffle secret for application {}", appId); } /** diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/ShuffleSecretManagerSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/ShuffleSecretManagerSuite.java new file mode 100644 index 0000000000000..46c4c33865eea --- /dev/null +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/ShuffleSecretManagerSuite.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.sasl; + +import java.nio.ByteBuffer; + +import org.junit.Test; +import static org.junit.Assert.*; + +public class ShuffleSecretManagerSuite { + static String app1 = "app1"; + static String app2 = "app2"; + static String pw1 = "password1"; + static String pw2 = "password2"; + static String pw1update = "password1update"; + static String pw2update = "password2update"; + + @Test + public void testMultipleRegisters() { + ShuffleSecretManager secretManager = new ShuffleSecretManager(); + secretManager.registerApp(app1, pw1); + assertEquals(pw1, secretManager.getSecretKey(app1)); + secretManager.registerApp(app2, ByteBuffer.wrap(pw2.getBytes())); + assertEquals(pw2, secretManager.getSecretKey(app2)); + + // now update the password for the apps and make sure it takes affect + secretManager.registerApp(app1, pw1update); + assertEquals(pw1update, secretManager.getSecretKey(app1)); + secretManager.registerApp(app2, ByteBuffer.wrap(pw2update.getBytes())); + assertEquals(pw2update, secretManager.getSecretKey(app2)); + + secretManager.unregisterApp(app1); + assertNull(secretManager.getSecretKey(app1)); + assertEquals(pw2update, secretManager.getSecretKey(app2)); + + secretManager.unregisterApp(app2); + assertNull(secretManager.getSecretKey(app2)); + assertNull(secretManager.getSecretKey(app1)); + } +} From b650ee0265477ada68220cbf286fa79906608ef5 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 9 Oct 2017 13:55:55 -0700 Subject: [PATCH 681/779] [INFRA] Close stale PRs. Closes #19423 Closes #19455 From dadd13f365aad0d9228cd8b8e6d57ad32175b155 Mon Sep 17 00:00:00 2001 From: Pavel Sakun Date: Mon, 9 Oct 2017 23:00:04 +0100 Subject: [PATCH 682/779] [SPARK] Misleading error message for missing --proxy-user value Fix misleading error message when argument is expected. ## What changes were proposed in this pull request? Change message to be accurate. ## How was this patch tested? Messaging change, was tested manually. Author: Pavel Sakun Closes #19457 from pavel-sakun/patch-1. --- .../src/main/java/org/apache/spark/launcher/SparkLauncher.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index 718a368a8e731..75b8ef5ca5ef4 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -625,7 +625,7 @@ private static class ArgumentValidator extends SparkSubmitOptionParser { @Override protected boolean handle(String opt, String value) { if (value == null && hasValue) { - throw new IllegalArgumentException(String.format("'%s' does not expect a value.", opt)); + throw new IllegalArgumentException(String.format("'%s' expects a value.", opt)); } return true; } From 155ab6347ec7be06c937372a51e8013fdd371d93 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Mon, 9 Oct 2017 15:22:41 -0700 Subject: [PATCH 683/779] [SPARK-22170][SQL] Reduce memory consumption in broadcast joins. 
## What changes were proposed in this pull request? This updates the broadcast join code path to lazily decompress pages and iterate through UnsafeRows to prevent all rows from being held in memory while the broadcast table is being built. ## How was this patch tested? Existing tests. Author: Ryan Blue Closes #19394 from rdblue/broadcast-driver-memory. --- .../plans/physical/broadcastMode.scala | 6 ++++ .../spark/sql/execution/SparkPlan.scala | 19 ++++++++---- .../exchange/BroadcastExchangeExec.scala | 29 ++++++++++++++----- .../sql/execution/joins/HashedRelation.scala | 13 ++++++++- .../spark/sql/ConfigBehaviorSuite.scala | 2 +- .../execution/metric/SQLMetricsSuite.scala | 3 +- 6 files changed, 54 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/broadcastMode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/broadcastMode.scala index 2ab46dc8330aa..9fac95aed8f12 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/broadcastMode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/broadcastMode.scala @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.InternalRow trait BroadcastMode { def transform(rows: Array[InternalRow]): Any + def transform(rows: Iterator[InternalRow], sizeHint: Option[Long]): Any + def canonicalized: BroadcastMode } @@ -36,5 +38,9 @@ case object IdentityBroadcastMode extends BroadcastMode { // TODO: pack the UnsafeRows into single bytes array. override def transform(rows: Array[InternalRow]): Array[InternalRow] = rows + override def transform( + rows: Iterator[InternalRow], + sizeHint: Option[Long]): Array[InternalRow] = rows.toArray + override def canonicalized: BroadcastMode = this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index b263f100e6068..2ffd948f984bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -223,7 +223,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ * UnsafeRow is highly compressible (at least 8 bytes for any column), the byte array is also * compressed. 
*/ - private def getByteArrayRdd(n: Int = -1): RDD[Array[Byte]] = { + private def getByteArrayRdd(n: Int = -1): RDD[(Long, Array[Byte])] = { execute().mapPartitionsInternal { iter => var count = 0 val buffer = new Array[Byte](4 << 10) // 4K @@ -239,7 +239,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ out.writeInt(-1) out.flush() out.close() - Iterator(bos.toByteArray) + Iterator((count, bos.toByteArray)) } } @@ -274,19 +274,26 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ val byteArrayRdd = getByteArrayRdd() val results = ArrayBuffer[InternalRow]() - byteArrayRdd.collect().foreach { bytes => - decodeUnsafeRows(bytes).foreach(results.+=) + byteArrayRdd.collect().foreach { countAndBytes => + decodeUnsafeRows(countAndBytes._2).foreach(results.+=) } results.toArray } + private[spark] def executeCollectIterator(): (Long, Iterator[InternalRow]) = { + val countsAndBytes = getByteArrayRdd().collect() + val total = countsAndBytes.map(_._1).sum + val rows = countsAndBytes.iterator.flatMap(countAndBytes => decodeUnsafeRows(countAndBytes._2)) + (total, rows) + } + /** * Runs this query returning the result as an iterator of InternalRow. * * @note Triggers multiple jobs (one for each partition). */ def executeToIterator(): Iterator[InternalRow] = { - getByteArrayRdd().toLocalIterator.flatMap(decodeUnsafeRows) + getByteArrayRdd().map(_._2).toLocalIterator.flatMap(decodeUnsafeRows) } /** @@ -307,7 +314,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ return new Array[InternalRow](0) } - val childRDD = getByteArrayRdd(n) + val childRDD = getByteArrayRdd(n).map(_._2) val buf = new ArrayBuffer[InternalRow] val totalParts = childRDD.partitions.length diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala index 9c859e41f8762..880e18c6808b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, BroadcastPartitioning, Partitioning} import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.joins.HashedRelation import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ThreadUtils @@ -72,26 +72,39 @@ case class BroadcastExchangeExec( SQLExecution.withExecutionId(sparkContext, executionId) { try { val beforeCollect = System.nanoTime() - // Note that we use .executeCollect() because we don't want to convert data to Scala types - val input: Array[InternalRow] = child.executeCollect() - if (input.length >= 512000000) { + // Use executeCollect/executeCollectIterator to avoid conversion to Scala types + val (numRows, input) = child.executeCollectIterator() + if (numRows >= 512000000) { throw new SparkException( - s"Cannot broadcast the table with more than 512 millions rows: ${input.length} rows") + s"Cannot broadcast the table with more than 512 millions rows: $numRows rows") } + val beforeBuild = System.nanoTime() longMetric("collectTime") 
+= (beforeBuild - beforeCollect) / 1000000 - val dataSize = input.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum + + // Construct the relation. + val relation = mode.transform(input, Some(numRows)) + + val dataSize = relation match { + case map: HashedRelation => + map.estimatedSize + case arr: Array[InternalRow] => + arr.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum + case _ => + throw new SparkException("[BUG] BroadcastMode.transform returned unexpected type: " + + relation.getClass.getName) + } + longMetric("dataSize") += dataSize if (dataSize >= (8L << 30)) { throw new SparkException( s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB") } - // Construct and broadcast the relation. - val relation = mode.transform(input) val beforeBroadcast = System.nanoTime() longMetric("buildTime") += (beforeBroadcast - beforeBuild) / 1000000 + // Broadcast the relation val broadcasted = sparkContext.broadcast(relation) longMetric("broadcastTime") += (System.nanoTime() - beforeBroadcast) / 1000000 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index f8058b2f7813b..b2dcbe5aa9877 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -866,7 +866,18 @@ private[execution] case class HashedRelationBroadcastMode(key: Seq[Expression]) extends BroadcastMode { override def transform(rows: Array[InternalRow]): HashedRelation = { - HashedRelation(rows.iterator, canonicalized.key, rows.length) + transform(rows.iterator, Some(rows.length)) + } + + override def transform( + rows: Iterator[InternalRow], + sizeHint: Option[Long]): HashedRelation = { + sizeHint match { + case Some(numRows) => + HashedRelation(rows, canonicalized.key, numRows.toInt) + case None => + HashedRelation(rows, canonicalized.key) + } } override lazy val canonicalized: HashedRelationBroadcastMode = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala index 2c1e5db5fd9bb..cee85ec8af04d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala @@ -58,7 +58,7 @@ class ConfigBehaviorSuite extends QueryTest with SharedSQLContext { withSQLConf(SQLConf.RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION.key -> "1") { // If we only sample one point, the range boundaries will be pretty bad and the // chi-sq value would be very high. 
- assert(computeChiSquareTest() > 1000) + assert(computeChiSquareTest() > 300) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 0dc612ef735fa..58a194b8af62b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -227,8 +227,7 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared val df = df1.join(broadcast(df2), "key") testSparkPlanMetrics(df, 2, Map( 1L -> (("BroadcastHashJoin", Map( - "number of output rows" -> 2L, - "avg hash probe (min, med, max)" -> "\n(1, 1, 1)")))) + "number of output rows" -> 2L)))) ) } From 71c2b81aa0e0db70013821f5512df1fbd8e59445 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Mon, 9 Oct 2017 16:34:39 -0700 Subject: [PATCH 684/779] [SPARK-22230] Swap per-row order in state store restore. ## What changes were proposed in this pull request? In state store restore, for each row, put the saved state before the row in the iterator instead of after. This fixes an issue where agg(last('attr)) will forever return the last value of 'attr from the first microbatch. ## How was this patch tested? new unit test Author: Jose Torres Closes #19461 from joseph-torres/SPARK-22230. --- .../execution/streaming/statefulOperators.scala | 2 +- .../streaming/StreamingAggregationSuite.scala | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index fb960fbdde8b3..0d85542928ee6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -225,7 +225,7 @@ case class StateStoreRestoreExec( val key = getKey(row) val savedState = store.get(key) numOutputRows += 1 - row +: Option(savedState).toSeq + Option(savedState).toSeq :+ row } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 995cea3b37d4f..fe7efa69f7e31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -520,6 +520,22 @@ class StreamingAggregationSuite extends StateStoreMetricsTest } } + test("SPARK-22230: last should change with new batches") { + val input = MemoryStream[Int] + + val aggregated = input.toDF().agg(last('value)) + testStream(aggregated, OutputMode.Complete())( + AddData(input, 1, 2, 3), + CheckLastBatch(3), + AddData(input, 4, 5, 6), + CheckLastBatch(6), + AddData(input), + CheckLastBatch(6), + AddData(input, 0), + CheckLastBatch(0) + ) + } + /** Add blocks of data to the `BlockRDDBackedSource`. 
*/ case class AddBlockData(source: BlockRDDBackedSource, data: Seq[Int]*) extends AddData { override def addData(query: Option[StreamExecution]): (Source, Offset) = { From bebd2e1ce10a460555f75cda75df33f39a783469 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 9 Oct 2017 21:34:37 -0700 Subject: [PATCH 685/779] [SPARK-22222][CORE] Fix the ARRAY_MAX in BufferHolder and add a test ## What changes were proposed in this pull request? We should not break the assumption that the length of the allocated byte array is word rounded: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java#L170 So we want to use `Integer.MAX_VALUE - 15` instead of `Integer.MAX_VALUE - 8` as the upper bound of an allocated byte array. cc: srowen gatorsmile ## How was this patch tested? Since the Spark unit test JVM has less than 1GB heap, here we run the test code as a submit job, so it can run on a JVM has 4GB memory. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Feng Liu Closes #19460 from liufengdb/fix_array_max. --- .../spark/unsafe/array/ByteArrayMethods.java | 7 ++ .../unsafe/map/HashMapGrowthStrategy.java | 6 +- .../collection/PartitionedPairBuffer.scala | 6 +- .../spark/deploy/SparkSubmitSuite.scala | 52 +++++++------ .../expressions/codegen/BufferHolder.java | 7 +- .../BufferHolderSparkSubmitSutie.scala | 78 +++++++++++++++++++ .../vectorized/WritableColumnVector.java | 3 +- 7 files changed, 124 insertions(+), 35 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java index 9c551ab19e9aa..f121b1cd745b8 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java @@ -40,6 +40,13 @@ public static int roundNumberOfBytesToNearestWord(int numBytes) { } } + // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat smaller. + // Be conservative and lower the cap a little. + // Refer to "http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/tip/src/share/classes/java/util/ArrayList.java#l229" + // This value is word rounded. Use this value if the allocated byte arrays are used to store other + // types rather than bytes. + public static int MAX_ROUNDED_ARRAY_LENGTH = Integer.MAX_VALUE - 15; + private static final boolean unaligned = Platform.unaligned(); /** * Optimized byte array equality check for byte arrays. diff --git a/core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java b/core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java index b8c2294c7b7ab..ee6d9f75ac5aa 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java @@ -17,6 +17,8 @@ package org.apache.spark.unsafe.map; +import org.apache.spark.unsafe.array.ByteArrayMethods; + /** * Interface that defines how we can grow the size of a hash map when it is over a threshold. */ @@ -31,9 +33,7 @@ public interface HashMapGrowthStrategy { class Doubling implements HashMapGrowthStrategy { - // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat - // smaller. 
Be conservative and lower the cap a little. - private static final int ARRAY_MAX = Integer.MAX_VALUE - 8; + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; @Override public int nextCapacity(int currentCapacity) { diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala index b755e5da51684..e17a9de97e335 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala @@ -19,6 +19,8 @@ package org.apache.spark.util.collection import java.util.Comparator +import org.apache.spark.unsafe.Platform +import org.apache.spark.unsafe.array.ByteArrayMethods import org.apache.spark.util.collection.WritablePartitionedPairCollection._ /** @@ -96,7 +98,5 @@ private[spark] class PartitionedPairBuffer[K, V](initialCapacity: Int = 64) } private object PartitionedPairBuffer { - // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat - // smaller. Be conservative and lower the cap a little. - val MAXIMUM_CAPACITY: Int = (Int.MaxValue - 8) / 2 + val MAXIMUM_CAPACITY: Int = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 2 } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index b06f2e26a4a7a..b52da4c0c8bc3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -100,6 +100,8 @@ class SparkSubmitSuite with TimeLimits with TestPrematureExit { + import SparkSubmitSuite._ + override def beforeEach() { super.beforeEach() System.setProperty("spark.testing", "true") @@ -974,30 +976,6 @@ class SparkSubmitSuite } } - // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. - private def runSparkSubmit(args: Seq[String]): Unit = { - val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - val sparkSubmitFile = if (Utils.isWindows) { - new File("..\\bin\\spark-submit.cmd") - } else { - new File("../bin/spark-submit") - } - val process = Utils.executeCommand( - Seq(sparkSubmitFile.getCanonicalPath) ++ args, - new File(sparkHome), - Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) - - try { - val exitCode = failAfter(60 seconds) { process.waitFor() } - if (exitCode != 0) { - fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.") - } - } finally { - // Ensure we still kill the process in case it timed out - process.destroy() - } - } - private def forConfDir(defaults: Map[String, String]) (f: String => Unit) = { val tmpDir = Utils.createTempDir() @@ -1020,6 +998,32 @@ class SparkSubmitSuite } } +object SparkSubmitSuite extends SparkFunSuite with TimeLimits { + // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. 
+ def runSparkSubmit(args: Seq[String], root: String = ".."): Unit = { + val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) + val sparkSubmitFile = if (Utils.isWindows) { + new File(s"$root\\bin\\spark-submit.cmd") + } else { + new File(s"$root/bin/spark-submit") + } + val process = Utils.executeCommand( + Seq(sparkSubmitFile.getCanonicalPath) ++ args, + new File(sparkHome), + Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) + + try { + val exitCode = failAfter(60 seconds) { process.waitFor() } + if (exitCode != 0) { + fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.") + } + } finally { + // Ensure we still kill the process in case it timed out + process.destroy() + } + } +} + object JarCreationTest extends Logging { def main(args: Array[String]) { Utils.configTestLog4j("INFO") diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java index 971d19973f067..259976118c12f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolder.java @@ -19,6 +19,7 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow; import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; /** * A helper class to manage the data buffer for an unsafe row. The data buffer can grow and @@ -36,9 +37,7 @@ */ public class BufferHolder { - // Some JVMs can't allocate arrays of length Integer.MAX_VALUE; actual max is somewhat - // smaller. Be conservative and lower the cap a little. - private static final int ARRAY_MAX = Integer.MAX_VALUE - 8; + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; public byte[] buffer; public int cursor = Platform.BYTE_ARRAY_OFFSET; @@ -51,7 +50,7 @@ public BufferHolder(UnsafeRow row) { public BufferHolder(UnsafeRow row, int initialSize) { int bitsetWidthInBytes = UnsafeRow.calculateBitSetWidthInBytes(row.numFields()); - if (row.numFields() > (Integer.MAX_VALUE - initialSize - bitsetWidthInBytes) / 8) { + if (row.numFields() > (ARRAY_MAX - initialSize - bitsetWidthInBytes) / 8) { throw new UnsupportedOperationException( "Cannot create BufferHolder for input UnsafeRow because there are " + "too many fields (number of fields: " + row.numFields() + ")"); diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala new file mode 100644 index 0000000000000..1167d2f3f3891 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen + +import org.scalatest.{BeforeAndAfterEach, Matchers} +import org.scalatest.concurrent.Timeouts + +import org.apache.spark.{SparkFunSuite, TestUtils} +import org.apache.spark.deploy.SparkSubmitSuite +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.unsafe.array.ByteArrayMethods +import org.apache.spark.util.ResetSystemProperties + +// A test for growing the buffer holder to nearly 2GB. Due to the heap size limitation of the Spark +// unit tests JVM, the actually test code is running as a submit job. +class BufferHolderSparkSubmitSuite + extends SparkFunSuite + with Matchers + with BeforeAndAfterEach + with ResetSystemProperties + with Timeouts { + + test("SPARK-22222: Buffer holder should be able to allocate memory larger than 1GB") { + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + + val argsForSparkSubmit = Seq( + "--class", BufferHolderSparkSubmitSuite.getClass.getName.stripSuffix("$"), + "--name", "SPARK-22222", + "--master", "local-cluster[2,1,1024]", + "--driver-memory", "4g", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", + "--conf", "spark.driver.extraJavaOptions=-ea", + unusedJar.toString) + SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") + } +} + +object BufferHolderSparkSubmitSuite { + + def main(args: Array[String]): Unit = { + + val ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + + val holder = new BufferHolder(new UnsafeRow(1000)) + + holder.reset() + holder.grow(roundToWord(ARRAY_MAX / 2)) + + holder.reset() + holder.grow(roundToWord(ARRAY_MAX / 2 + 8)) + + holder.reset() + holder.grow(roundToWord(Integer.MAX_VALUE / 2)) + + holder.reset() + holder.grow(roundToWord(Integer.MAX_VALUE)) + } + + private def roundToWord(len: Int): Int = { + ByteArrayMethods.roundNumberOfBytesToNearestWord(len) + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index da72954ddc448..d3a14b9d8bd74 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -23,6 +23,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.*; +import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; /** @@ -595,7 +596,7 @@ public final int appendStruct(boolean isNull) { * Upper limit for the maximum capacity for this column. */ @VisibleForTesting - protected int MAX_CAPACITY = Integer.MAX_VALUE - 8; + protected int MAX_CAPACITY = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; /** * Number of nulls in this column. This is an optimization for the reader, to skip NULL checks. 
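As a quick illustration of why the SPARK-22222 patch above picks `Integer.MAX_VALUE - 15` rather than `Integer.MAX_VALUE - 8`, here is a minimal, self-contained Scala sketch of the word-rounding argument. The `roundToWord` helper below is only a stand-in written for this note (it mirrors the round-up-to-the-nearest-word behaviour the patch comments describe); it is not the actual `ByteArrayMethods` implementation, and the object name is made up for the example.

```scala
// Sketch: the new cap (Int.MaxValue - 15) is itself a multiple of 8, so byte arrays
// capped at it keep the word-rounded-length assumption UnsafeRow relies on,
// while the old cap (Int.MaxValue - 8) is not word rounded.
object WordRoundingSketch {
  // Stand-in for the word rounding described above: round up to the next multiple of 8.
  def roundToWord(numBytes: Int): Int = {
    val remainder = numBytes & 0x07 // numBytes % 8
    if (remainder == 0) numBytes else numBytes + (8 - remainder)
  }

  def main(args: Array[String]): Unit = {
    val newCap = Int.MaxValue - 15 // MAX_ROUNDED_ARRAY_LENGTH in the patch above
    val oldCap = Int.MaxValue - 8  // the previous ARRAY_MAX constant
    println(newCap % 8 == 0)               // true: already word rounded
    println(oldCap % 8 == 0)               // false: capping at it can yield a non-word-rounded length
    println(roundToWord(newCap) == newCap) // true: rounding the new cap is a no-op
  }
}
```

This only makes the commit message's rounding argument concrete; the authoritative constant remains `ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH` as introduced in the patch.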
From af8a34c787dc3d68f5148a7d9975b52650bb7729 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Mon, 9 Oct 2017 22:35:34 -0700 Subject: [PATCH 686/779] [SPARK-22159][SQL][FOLLOW-UP] Make config names consistently end with "enabled". ## What changes were proposed in this pull request? This is a follow-up of #19384. In the previous pr, only definitions of the config names were modified, but we also need to modify the names in runtime or tests specified as string literal. ## How was this patch tested? Existing tests but modified the config names. Author: Takuya UESHIN Closes #19462 from ueshin/issues/SPARK-22159/fup1. --- python/pyspark/sql/dataframe.py | 4 ++-- python/pyspark/sql/tests.py | 6 +++--- .../aggregate/HashAggregateExec.scala | 2 +- .../spark/sql/AggregateHashMapSuite.scala | 12 +++++------ .../benchmark/AggregateBenchmark.scala | 20 +++++++++---------- .../execution/AggregationQuerySuite.scala | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index b7ce9a83a616d..fe69e588fe098 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1878,7 +1878,7 @@ def toPandas(self): 1 5 Bob """ import pandas as pd - if self.sql_ctx.getConf("spark.sql.execution.arrow.enable", "false").lower() == "true": + if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: import pyarrow tables = self._collectAsArrow() @@ -1889,7 +1889,7 @@ def toPandas(self): return pd.DataFrame.from_records([], columns=self.columns) except ImportError as e: msg = "note: pyarrow must be installed and available on calling Python process " \ - "if using spark.sql.execution.arrow.enable=true" + "if using spark.sql.execution.arrow.enabled=true" raise ImportError("%s\n%s" % (e.message, msg)) else: pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 1b3af42c47ad2..a59378b5e848a 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3088,7 +3088,7 @@ class ArrowTests(ReusedPySparkTestCase): def setUpClass(cls): ReusedPySparkTestCase.setUpClass() cls.spark = SparkSession(cls.sc) - cls.spark.conf.set("spark.sql.execution.arrow.enable", "true") + cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true") cls.schema = StructType([ StructField("1_str_t", StringType(), True), StructField("2_int_t", IntegerType(), True), @@ -3120,9 +3120,9 @@ def test_null_conversion(self): def test_toPandas_arrow_toggle(self): df = self.spark.createDataFrame(self.data, schema=self.schema) - self.spark.conf.set("spark.sql.execution.arrow.enable", "false") + self.spark.conf.set("spark.sql.execution.arrow.enabled", "false") pdf = df.toPandas() - self.spark.conf.set("spark.sql.execution.arrow.enable", "true") + self.spark.conf.set("spark.sql.execution.arrow.enabled", "true") pdf_arrow = df.toPandas() self.assertFramesEqual(pdf_arrow, pdf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index f424096b330e3..8b573fdcf25e1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -539,7 +539,7 @@ case class HashAggregateExec( private def enableTwoLevelHashMap(ctx: CodegenContext) = { if 
(!checkIfFastHashMapSupported(ctx)) { if (modes.forall(mode => mode == Partial || mode == PartialMerge) && !Utils.isTesting) { - logInfo("spark.sql.codegen.aggregate.map.twolevel.enable is set to true, but" + logInfo("spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but" + " current version of codegened fast hashmap does not support this aggregate.") } } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala index 7e61a68025158..938d76c9f0837 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/AggregateHashMapSuite.scala @@ -24,14 +24,14 @@ import org.apache.spark.SparkConf class SingleLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") - .set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") // adding some checking after each test is run, assuring that the configs are not changed // in test code after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") - assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enable") == "false", + assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "false", "configuration parameter changed in test body") } } @@ -39,14 +39,14 @@ class SingleLevelAggregateHashMapSuite extends DataFrameAggregateSuite with Befo class TwoLevelAggregateHashMapSuite extends DataFrameAggregateSuite with BeforeAndAfter { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") - .set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") // adding some checking after each test is run, assuring that the configs are not changed // in test code after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") - assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enable") == "true", + assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "true", "configuration parameter changed in test body") } } @@ -57,7 +57,7 @@ class TwoLevelAggregateHashMapWithVectorizedMapSuite override protected def sparkConf: SparkConf = super.sparkConf .set("spark.sql.codegen.fallback", "false") - .set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + .set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") .set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") // adding some checking after each test is run, assuring that the configs are not changed @@ -65,7 +65,7 @@ class TwoLevelAggregateHashMapWithVectorizedMapSuite after { assert(sparkConf.get("spark.sql.codegen.fallback") == "false", "configuration parameter changed in test body") - assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enable") == "true", + assert(sparkConf.get("spark.sql.codegen.aggregate.map.twolevel.enabled") == "true", "configuration parameter changed in test body") assert(sparkConf.get("spark.sql.codegen.aggregate.map.vectorized.enable") == "true", "configuration parameter changed in test body") diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index aca1be01fa3da..a834b7cd2c69f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -107,14 +107,14 @@ class AggregateBenchmark extends BenchmarkBase { benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") f() } benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") f() } @@ -149,14 +149,14 @@ class AggregateBenchmark extends BenchmarkBase { benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", value = true) - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") f() } benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", value = true) - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") f() } @@ -189,14 +189,14 @@ class AggregateBenchmark extends BenchmarkBase { benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") f() } benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") f() } @@ -228,14 +228,14 @@ class AggregateBenchmark extends BenchmarkBase { benchmark.addCase(s"codegen = T hashmap = F") { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") f() } benchmark.addCase(s"codegen = T hashmap = T") { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - 
sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") f() } @@ -277,14 +277,14 @@ class AggregateBenchmark extends BenchmarkBase { benchmark.addCase(s"codegen = T hashmap = F") { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "false") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") f() } benchmark.addCase(s"codegen = T hashmap = T") { iter => sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enable", "true") + sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") f() } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index f245a79f805a2..ae675149df5e2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -1015,7 +1015,7 @@ class HashAggregationQueryWithControlledFallbackSuite extends AggregationQuerySu override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { Seq("true", "false").foreach { enableTwoLevelMaps => - withSQLConf("spark.sql.codegen.aggregate.map.twolevel.enable" -> + withSQLConf("spark.sql.codegen.aggregate.map.twolevel.enabled" -> enableTwoLevelMaps) { (1 to 3).foreach { fallbackStartsAt => withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" -> From 3b5c2a84bfa311a94c1c0a57f2cb3e421fb05650 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 10 Oct 2017 08:27:45 +0100 Subject: [PATCH 687/779] [SPARK-21770][ML] ProbabilisticClassificationModel fix corner case: normalization of all-zero raw predictions ## What changes were proposed in this pull request? Fix probabilisticClassificationModel corner case: normalization of all-zero raw predictions, throw IllegalArgumentException with description. ## How was this patch tested? Test case added. Author: WeichenXu Closes #19106 from WeichenXu123/SPARK-21770. --- .../ProbabilisticClassifier.scala | 20 ++++++++++--------- .../ProbabilisticClassifierSuite.scala | 18 +++++++++++++++++ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index ef08134809915..730fcab333e11 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -230,21 +230,23 @@ private[ml] object ProbabilisticClassificationModel { * Normalize a vector of raw predictions to be a multinomial probability vector, in place. * * The input raw predictions should be nonnegative. - * The output vector sums to 1, unless the input vector is all-0 (in which case the output is - * all-0 too). + * The output vector sums to 1. 
* * NOTE: This is NOT applicable to all models, only ones which effectively use class * instance counts for raw predictions. + * + * @throws IllegalArgumentException if the input vector is all-0 or including negative values */ def normalizeToProbabilitiesInPlace(v: DenseVector): Unit = { + v.values.foreach(value => require(value >= 0, + "The input raw predictions should be nonnegative.")) val sum = v.values.sum - if (sum != 0) { - var i = 0 - val size = v.size - while (i < size) { - v.values(i) /= sum - i += 1 - } + require(sum > 0, "Can't normalize the 0-vector.") + var i = 0 + val size = v.size + while (i < size) { + v.values(i) /= sum + i += 1 } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala index 4ecd5a05365eb..d649ceac949c4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ProbabilisticClassifierSuite.scala @@ -80,6 +80,24 @@ class ProbabilisticClassifierSuite extends SparkFunSuite { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } + + test("normalizeToProbabilitiesInPlace") { + val vec1 = Vectors.dense(1.0, 2.0, 3.0).toDense + ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(vec1) + assert(vec1 ~== Vectors.dense(1.0 / 6, 2.0 / 6, 3.0 / 6) relTol 1e-3) + + // all-0 input test + val vec2 = Vectors.dense(0.0, 0.0, 0.0).toDense + intercept[IllegalArgumentException] { + ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(vec2) + } + + // negative input test + val vec3 = Vectors.dense(1.0, -1.0, 2.0).toDense + intercept[IllegalArgumentException] { + ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(vec3) + } + } } object ProbabilisticClassifierSuite { From b8a08f25cc64ed3034f3c90790931c30e5b0f236 Mon Sep 17 00:00:00 2001 From: liuxian Date: Tue, 10 Oct 2017 20:44:33 +0800 Subject: [PATCH 688/779] [SPARK-21506][DOC] The description of "spark.executor.cores" may be not correct ## What changes were proposed in this pull request? The number of cores assigned to each executor is configurable. When this is not explicitly set, multiple executors from the same application may be launched on the same worker too. ## How was this patch tested? N/A Author: liuxian Closes #18711 from 10110346/executorcores. 
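To make the scheduling behaviour described above concrete, here is a minimal Scala sketch of an application that pins the executor size in standalone mode so that several executors of the same application can share one worker. The master URL, app name, and resource numbers are placeholders invented for this illustration, not values from the patch; only the configuration keys (`spark.executor.cores`, `spark.executor.memory`, `spark.cores.max`) are real Spark settings.

```scala
// Sketch: with spark.executor.cores set, a 16-core standalone worker can host
// multiple 2-core executors of this application instead of one executor taking all cores.
import org.apache.spark.{SparkConf, SparkContext}

object ExecutorCoresExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("executor-cores-example")        // placeholder app name
      .setMaster("spark://standalone-master:7077") // placeholder standalone master URL
      .set("spark.executor.cores", "2")            // each executor is granted exactly 2 cores
      .set("spark.executor.memory", "2g")          // and 2g of memory
      .set("spark.cores.max", "8")                 // 8 total cores / 2 per executor => up to 4 executors
    val sc = new SparkContext(conf)
    try {
      // trivial job, just so the executors have something to run
      println(sc.parallelize(1 to 1000).map(_ * 2).sum())
    } finally {
      sc.stop()
    }
  }
}
```

When `spark.executor.cores` is left unset, each executor grabs all free cores on its worker, although, as the patch notes, later scheduling iterations can still place additional executors of the same application on that worker.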
--- .../spark/deploy/client/StandaloneAppClient.scala | 2 +- .../scala/org/apache/spark/deploy/master/Master.scala | 8 +++++++- .../cluster/StandaloneSchedulerBackend.scala | 2 +- docs/configuration.md | 11 ++++------- docs/spark-standalone.md | 8 ++++++++ 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala index 757c930b84eb2..34ade4ce6f39b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala @@ -170,7 +170,7 @@ private[spark] class StandaloneAppClient( case ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) => val fullId = appId + "/" + id - logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, hostPort, + logInfo("Executor added: %s on %s (%s) with %d core(s)".format(fullId, workerId, hostPort, cores)) listener.executorAdded(fullId, workerId, hostPort, cores, memory) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index e030cac60a8e4..2c78c15773af2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -581,7 +581,13 @@ private[deploy] class Master( * The number of cores assigned to each executor is configurable. When this is explicitly set, * multiple executors from the same application may be launched on the same worker if the worker * has enough cores and memory. Otherwise, each executor grabs all the cores available on the - * worker by default, in which case only one executor may be launched on each worker. + * worker by default, in which case only one executor per application may be launched on each + * worker during one single schedule iteration. + * Note that when `spark.executor.cores` is not set, we may still launch multiple executors from + * the same application on the same worker. Consider appA and appB both have one executor running + * on worker1, and appA.coresLeft > 0, then appB is finished and release all its cores on worker1, + * thus for the next schedule iteration, appA launches a new executor that grabs all the free + * cores on worker1, therefore we get multiple executors from appA running on worker1. * * It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core * at a time). Consider the following example: cluster has 4 workers with 16 cores each. 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index a4e2a74341283..505c342a889ee 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -153,7 +153,7 @@ private[spark] class StandaloneSchedulerBackend( override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int) { - logInfo("Granted executor ID %s on hostPort %s with %d cores, %s RAM".format( + logInfo("Granted executor ID %s on hostPort %s with %d core(s), %s RAM".format( fullId, hostPort, cores, Utils.megabytesToString(memory))) } diff --git a/docs/configuration.md b/docs/configuration.md index 6e9fe591b70a3..7a777d3c6fa3d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1015,7 +1015,7 @@ Apart from these, the following properties are also available, and may be useful 0.5 Amount of storage memory immune to eviction, expressed as a fraction of the size of the - region set aside by s​park.memory.fraction. The higher this is, the less + region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description. @@ -1041,7 +1041,7 @@ Apart from these, the following properties are also available, and may be useful spark.memory.useLegacyMode false - ​Whether to enable the legacy memory management mode used in Spark 1.5 and before. + Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: @@ -1115,11 +1115,8 @@ Apart from these, the following properties are also available, and may be useful The number of cores to use on each executor. - In standalone and Mesos coarse-grained modes, setting this - parameter allows an application to run multiple executors on the - same worker, provided that there are enough cores on that - worker. Otherwise, only one executor per application will run on - each worker. + In standalone and Mesos coarse-grained modes, for more detail, see + this description. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 1095386c31ab8..f51c5cc38f4de 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -328,6 +328,14 @@ export SPARK_MASTER_OPTS="-Dspark.deploy.defaultCores=" This is useful on shared clusters where users might not have configured a maximum number of cores individually. +# Executors Scheduling + +The number of cores assigned to each executor is configurable. When `spark.executor.cores` is +explicitly set, multiple executors from the same application may be launched on the same worker +if the worker has enough cores and memory. Otherwise, each executor grabs all the cores available +on the worker by default, in which case only one executor per application may be launched on each +worker during one single schedule iteration. + # Monitoring and Logging Spark's standalone mode offers a web-based user interface to monitor the cluster. 
The master and each worker has its own web UI that shows cluster and job statistics. By default you can access the web UI for the master at port 8080. The port can be changed either in the configuration file or via command-line options. From 23af2d79ad9a3c83936485ee57513b39193a446b Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Tue, 10 Oct 2017 20:48:42 +0800 Subject: [PATCH 689/779] [SPARK-20025][CORE] Ignore SPARK_LOCAL* env, while deploying via cluster mode. ## What changes were proposed in this pull request? In a bare metal system with No DNS setup, spark may be configured with SPARK_LOCAL* for IP and host properties. During a driver failover, in cluster deployment mode. SPARK_LOCAL* should be ignored while restarting on another node and should be picked up from target system's local environment. ## How was this patch tested? Distributed deployment against a spark standalone cluster of 6 Workers. Tested by killing JVM's running driver and verified the restarted JVMs have right configurations on them. Author: Prashant Sharma Author: Prashant Sharma Closes #17357 from ScrapCodes/driver-failover-fix. --- core/src/main/scala/org/apache/spark/deploy/Client.scala | 6 +++--- .../apache/spark/deploy/rest/StandaloneRestServer.scala | 4 +++- .../org/apache/spark/deploy/worker/DriverWrapper.scala | 9 ++++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index bf6093236d92b..7acb5c55bb252 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -93,19 +93,19 @@ private class ClientEndpoint( driverArgs.cores, driverArgs.supervise, command) - ayncSendToMasterAndForwardReply[SubmitDriverResponse]( + asyncSendToMasterAndForwardReply[SubmitDriverResponse]( RequestSubmitDriver(driverDescription)) case "kill" => val driverId = driverArgs.driverId - ayncSendToMasterAndForwardReply[KillDriverResponse](RequestKillDriver(driverId)) + asyncSendToMasterAndForwardReply[KillDriverResponse](RequestKillDriver(driverId)) } } /** * Send the message to master and forward the reply to self asynchronously. */ - private def ayncSendToMasterAndForwardReply[T: ClassTag](message: Any): Unit = { + private def asyncSendToMasterAndForwardReply[T: ClassTag](message: Any): Unit = { for (masterEndpoint <- masterEndpoints) { masterEndpoint.ask[T](message).onComplete { case Success(v) => self.send(v) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala index 0164084ab129e..22b65abce611a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala @@ -139,7 +139,9 @@ private[rest] class StandaloneSubmitRequestServlet( val driverExtraLibraryPath = sparkProperties.get("spark.driver.extraLibraryPath") val superviseDriver = sparkProperties.get("spark.driver.supervise") val appArgs = request.appArgs - val environmentVariables = request.environmentVariables + // Filter SPARK_LOCAL_(IP|HOSTNAME) environment variables from being set on the remote system. 
+ val environmentVariables = + request.environmentVariables.filterNot(x => x._1.matches("SPARK_LOCAL_(IP|HOSTNAME)")) // Construct driver description val conf = new SparkConf(false) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index c1671192e0c64..b19c9904d5982 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -23,6 +23,7 @@ import org.apache.commons.lang3.StringUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{DependencyUtils, SparkHadoopUtil, SparkSubmit} +import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} @@ -30,7 +31,7 @@ import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, U * Utility object for launching driver programs such that they share fate with the Worker process. * This is used in standalone cluster mode only. */ -object DriverWrapper { +object DriverWrapper extends Logging { def main(args: Array[String]) { args.toList match { /* @@ -41,8 +42,10 @@ object DriverWrapper { */ case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() - val rpcEnv = RpcEnv.create("Driver", - Utils.localHostName(), 0, conf, new SecurityManager(conf)) + val host: String = Utils.localHostName() + val port: Int = sys.props.getOrElse("spark.driver.port", "0").toInt + val rpcEnv = RpcEnv.create("Driver", host, port, conf, new SecurityManager(conf)) + logInfo(s"Driver address: ${rpcEnv.address}") rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl)) val currentLoader = Thread.currentThread.getContextClassLoader From 633ffd816d285480bab1f346471135b10ec092bb Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 10 Oct 2017 11:01:02 -0700 Subject: [PATCH 690/779] rename the file. --- ...rSparkSubmitSutie.scala => BufferHolderSparkSubmitSuite.scala} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/{BufferHolderSparkSubmitSutie.scala => BufferHolderSparkSubmitSuite.scala} (100%) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala similarity index 100% rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSutie.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/BufferHolderSparkSubmitSuite.scala From 2028e5a82bc3e9a79f9b84f376bdf606b8c9bb0f Mon Sep 17 00:00:00 2001 From: Eyal Farago Date: Tue, 10 Oct 2017 22:49:47 +0200 Subject: [PATCH 691/779] [SPARK-21907][CORE] oom during spill ## What changes were proposed in this pull request? 1. a test reproducing [SPARK-21907](https://issues.apache.org/jira/browse/SPARK-21907) 2. a fix for the root cause of the issue. 
The root cause: `org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spill` calls `org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset`, which may trigger another spill. When this happens, the `array` member has already been de-allocated but is still referenced by the code, so the nested spill fails with an NPE in `org.apache.spark.memory.TaskMemoryManager.getPage`. This patch introduces a reproduction in a test case and a fix: the fix simply nulls out the in-memory sorter's array member (and zeroes its bookkeeping) before actually performing the new allocation. This prevents the spilling code from 'touching' the de-allocated array. ## How was this patch tested? Introduced a new test case: `org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorterSuite#testOOMDuringSpill`. Author: Eyal Farago Closes #19181 from eyalfa/SPARK-21907__oom_during_spill. --- .../unsafe/sort/UnsafeExternalSorter.java | 4 ++ .../unsafe/sort/UnsafeInMemorySorter.java | 12 ++++- .../sort/UnsafeExternalSorterSuite.java | 33 +++++++++++++ .../sort/UnsafeInMemorySorterSuite.java | 46 +++++++++++++++++++ .../spark/memory/TestMemoryManager.scala | 12 +++-- 5 files changed, 102 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 39eda00dd7efb..e749f7ba87c6e 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -480,6 +480,10 @@ public UnsafeSorterIterator getSortedIterator() throws IOException { } } + @VisibleForTesting boolean hasSpaceForAnotherRecord() { + return inMemSorter.hasSpaceForAnotherRecord(); + } + private static void spillIterator(UnsafeSorterIterator inMemIterator, UnsafeSorterSpillWriter spillWriter) throws IOException { while (inMemIterator.hasNext()) { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index c14c12664f5ab..869ec908be1fb 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -162,7 +162,9 @@ private int getUsableCapacity() { */ public void free() { if (consumer != null) { - consumer.freeArray(array); + if (array != null) { + consumer.freeArray(array); + } array = null; } } @@ -170,6 +172,14 @@ public void free() { public void reset() { if (consumer != null) { consumer.freeArray(array); + // The call to consumer.allocateArray may trigger a spill, + // which in turn accesses this instance and eventually re-enters this method and tries to free the array again. + // By setting the array to null and its length to 0 we effectively make the spill code-path a no-op. + // Setting the array to null also indicates that it has already been de-allocated, which prevents a double de-allocation in free().
+ array = null; + usableCapacity = 0; + pos = 0; + nullBoundaryPos = 0; array = consumer.allocateArray(initialSize); usableCapacity = getUsableCapacity(); } diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 5330a688e63e3..6c5451d0fd2a5 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -23,6 +23,7 @@ import java.util.LinkedList; import java.util.UUID; +import org.hamcrest.Matchers; import scala.Tuple2$; import org.junit.After; @@ -503,6 +504,38 @@ public void testGetIterator() throws Exception { verifyIntIterator(sorter.getIterator(279), 279, 300); } + @Test + public void testOOMDuringSpill() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + // we assume that given default configuration, + // the size of the data we insert to the sorter (ints) + // and assuming we shouldn't spill before pointers array is exhausted + // (memory manager is not configured to throw at this point) + // - so this loop runs a reasonable number of iterations (<2000). + // test indeed completed within <30ms (on a quad i7 laptop). + for (int i = 0; sorter.hasSpaceForAnotherRecord(); ++i) { + insertNumber(sorter, i); + } + // we expect the next insert to attempt growing the pointerssArray + // first allocation is expected to fail, then a spill is triggered which attempts another allocation + // which also fails and we expect to see this OOM here. + // the original code messed with a released array within the spill code + // and ended up with a failed assertion. 
+ // we also expect the location of the OOM to be org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset + memoryManager.markconsequentOOM(2); + try { + insertNumber(sorter, 1024); + fail("expected OutOfMmoryError but it seems operation surprisingly succeeded"); + } + // we expect an OutOfMemoryError here, anything else (i.e the original NPE is a failure) + catch (OutOfMemoryError oom){ + String oomStackTrace = Utils.exceptionString(oom); + assertThat("expected OutOfMemoryError in org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset", + oomStackTrace, + Matchers.containsString("org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset")); + } + } + private void verifyIntIterator(UnsafeSorterIterator iter, int start, int end) throws IOException { for (int i = start; i < end; i++) { diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index bd89085aa9a14..1a3e11efe9787 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -35,6 +35,7 @@ import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.isIn; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; import static org.mockito.Mockito.mock; public class UnsafeInMemorySorterSuite { @@ -139,4 +140,49 @@ public int compare( } assertEquals(dataToSort.length, iterLength); } + + @Test + public void freeAfterOOM() { + final SparkConf sparkConf = new SparkConf(); + sparkConf.set("spark.memory.offHeap.enabled", "false"); + + final TestMemoryManager testMemoryManager = + new TestMemoryManager(sparkConf); + final TaskMemoryManager memoryManager = new TaskMemoryManager( + testMemoryManager, 0); + final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager); + final MemoryBlock dataPage = memoryManager.allocatePage(2048, consumer); + final Object baseObject = dataPage.getBaseObject(); + // Write the records into the data page: + long position = dataPage.getBaseOffset(); + + final HashPartitioner hashPartitioner = new HashPartitioner(4); + // Use integer comparison for comparing prefixes (which are partition ids, in this case) + final PrefixComparator prefixComparator = PrefixComparators.LONG; + final RecordComparator recordComparator = new RecordComparator() { + @Override + public int compare( + Object leftBaseObject, + long leftBaseOffset, + Object rightBaseObject, + long rightBaseOffset) { + return 0; + } + }; + UnsafeInMemorySorter sorter = new UnsafeInMemorySorter(consumer, memoryManager, + recordComparator, prefixComparator, 100, shouldUseRadixSort()); + + testMemoryManager.markExecutionAsOutOfMemoryOnce(); + try { + sorter.reset(); + fail("expected OutOfMmoryError but it seems operation surprisingly succeeded"); + } catch (OutOfMemoryError oom) { + // as expected + } + // [SPARK-21907] this failed on NPE at org.apache.spark.memory.MemoryConsumer.freeArray(MemoryConsumer.java:108) + sorter.free(); + // simulate a 'back to back' free. 
+ sorter.free(); + } + } diff --git a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala index 5f699df8211de..c26945fa5fa31 100644 --- a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala +++ b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala @@ -27,8 +27,8 @@ class TestMemoryManager(conf: SparkConf) numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { - if (oomOnce) { - oomOnce = false + if (consequentOOM > 0) { + consequentOOM -= 1 0 } else if (available >= numBytes) { available -= numBytes @@ -58,11 +58,15 @@ class TestMemoryManager(conf: SparkConf) override def maxOffHeapStorageMemory: Long = 0L - private var oomOnce = false + private var consequentOOM = 0 private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { - oomOnce = true + markconsequentOOM(1) + } + + def markconsequentOOM(n : Int) : Unit = { + consequentOOM += n } def limit(avail: Long): Unit = { From bfc7e1fe1ad5f9777126f2941e29bbe51ea5da7c Mon Sep 17 00:00:00 2001 From: Li Jin Date: Wed, 11 Oct 2017 07:32:01 +0900 Subject: [PATCH 692/779] [SPARK-20396][SQL][PYSPARK] groupby().apply() with pandas udf ## What changes were proposed in this pull request? This PR adds an apply() function on df.groupby(). apply() takes a pandas udf that is a transformation on `pandas.DataFrame` -> `pandas.DataFrame`. Static schema ------------------- ``` schema = df.schema pandas_udf(schema) def normalize(df): df = df.assign(v1 = (df.v1 - df.v1.mean()) / df.v1.std() return df df.groupBy('id').apply(normalize) ``` Dynamic schema ----------------------- **This use case is removed from the PR and we will discuss this as a follow up. See discussion https://github.com/apache/spark/pull/18732#pullrequestreview-66583248** Another example to use pd.DataFrame dtypes as output schema of the udf: ``` sample_df = df.filter(df.id == 1).toPandas() def foo(df): ret = # Some transformation on the input pd.DataFrame return ret foo_udf = pandas_udf(foo, foo(sample_df).dtypes) df.groupBy('id').apply(foo_udf) ``` In interactive use case, user usually have a sample pd.DataFrame to test function `foo` in their notebook. Having been able to use `foo(sample_df).dtypes` frees user from specifying the output schema of `foo`. Design doc: https://github.com/icexelloss/spark/blob/pandas-udf-doc/docs/pyspark-pandas-udf.md ## How was this patch tested? * Added GroupbyApplyTest Author: Li Jin Author: Takuya UESHIN Author: Bryan Cutler Closes #18732 from icexelloss/groupby-apply-SPARK-20396. 
--- python/pyspark/sql/dataframe.py | 6 +- python/pyspark/sql/functions.py | 98 ++++++++--- python/pyspark/sql/group.py | 88 +++++++++- python/pyspark/sql/tests.py | 157 +++++++++++++++++- python/pyspark/sql/types.py | 2 +- python/pyspark/worker.py | 35 ++-- .../sql/catalyst/optimizer/Optimizer.scala | 2 + .../logical/pythonLogicalOperators.scala | 39 +++++ .../spark/sql/RelationalGroupedDataset.scala | 36 +++- .../spark/sql/execution/SparkStrategies.scala | 2 + .../python/ArrowEvalPythonExec.scala | 39 ++++- .../execution/python/ArrowPythonRunner.scala | 15 +- .../execution/python/ExtractPythonUDFs.scala | 8 +- .../python/FlatMapGroupsInPandasExec.scala | 103 ++++++++++++ 14 files changed, 561 insertions(+), 69 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index fe69e588fe098..2d596229ced7e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1227,7 +1227,7 @@ def groupBy(self, *cols): """ jgd = self._jdf.groupBy(self._jcols(*cols)) from pyspark.sql.group import GroupedData - return GroupedData(jgd, self.sql_ctx) + return GroupedData(jgd, self) @since(1.4) def rollup(self, *cols): @@ -1248,7 +1248,7 @@ def rollup(self, *cols): """ jgd = self._jdf.rollup(self._jcols(*cols)) from pyspark.sql.group import GroupedData - return GroupedData(jgd, self.sql_ctx) + return GroupedData(jgd, self) @since(1.4) def cube(self, *cols): @@ -1271,7 +1271,7 @@ def cube(self, *cols): """ jgd = self._jdf.cube(self._jcols(*cols)) from pyspark.sql.group import GroupedData - return GroupedData(jgd, self.sql_ctx) + return GroupedData(jgd, self) @since(1.3) def agg(self, *exprs): diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index b45a59db93679..9bc12c3b7a162 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2058,7 +2058,7 @@ def __init__(self, func, returnType, name=None, vectorized=False): self._name = name or ( func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) - self._vectorized = vectorized + self.vectorized = vectorized @property def returnType(self): @@ -2090,7 +2090,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self._vectorized) + self._name, wrapped_func, jdt, self.vectorized) return judf def __call__(self, *cols): @@ -2118,8 +2118,10 @@ def wrapper(*args): wrapper.__name__ = self._name wrapper.__module__ = (self.func.__module__ if hasattr(self.func, '__module__') else self.func.__class__.__module__) + wrapper.func = self.func wrapper.returnType = self.returnType + wrapper.vectorized = self.vectorized return wrapper @@ -2129,8 +2131,12 @@ def _create_udf(f, returnType, vectorized): def _udf(f, returnType=StringType(), vectorized=vectorized): if vectorized: import inspect - if len(inspect.getargspec(f).args) == 0: - raise NotImplementedError("0-parameter pandas_udfs are not currently supported") + argspec = inspect.getargspec(f) + if len(argspec.args) == 0 and argspec.varargs is None: + raise ValueError( + "0-arg pandas_udfs are not supported. 
" + "Instead, create a 1-arg pandas_udf and ignore the arg in your function." + ) udf_obj = UserDefinedFunction(f, returnType, vectorized=vectorized) return udf_obj._wrapped() @@ -2146,7 +2152,7 @@ def _udf(f, returnType=StringType(), vectorized=vectorized): @since(1.3) def udf(f=None, returnType=StringType()): - """Creates a :class:`Column` expression representing a user defined function (UDF). + """Creates a user defined function (UDF). .. note:: The user-defined functions must be deterministic. Due to optimization, duplicate invocations may be eliminated or the function may even be invoked more times than @@ -2181,30 +2187,70 @@ def udf(f=None, returnType=StringType()): @since(2.3) def pandas_udf(f=None, returnType=StringType()): """ - Creates a :class:`Column` expression representing a user defined function (UDF) that accepts - `Pandas.Series` as input arguments and outputs a `Pandas.Series` of the same length. + Creates a vectorized user defined function (UDF). - :param f: python function if used as a standalone function + :param f: user-defined function. A python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object - >>> from pyspark.sql.types import IntegerType, StringType - >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) - >>> @pandas_udf(returnType=StringType()) - ... def to_upper(s): - ... return s.str.upper() - ... - >>> @pandas_udf(returnType="integer") - ... def add_one(x): - ... return x + 1 - ... - >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) - >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP - +----------+--------------+------------+ - |slen(name)|to_upper(name)|add_one(age)| - +----------+--------------+------------+ - | 8| JOHN DOE| 22| - +----------+--------------+------------+ + The user-defined function can define one of the following transformations: + + 1. One or more `pandas.Series` -> A `pandas.Series` + + This udf is used with :meth:`pyspark.sql.DataFrame.withColumn` and + :meth:`pyspark.sql.DataFrame.select`. + The returnType should be a primitive data type, e.g., `DoubleType()`. + The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`. + + >>> from pyspark.sql.types import IntegerType, StringType + >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) + >>> @pandas_udf(returnType=StringType()) + ... def to_upper(s): + ... return s.str.upper() + ... + >>> @pandas_udf(returnType="integer") + ... def add_one(x): + ... return x + 1 + ... + >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) + >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ + ... .show() # doctest: +SKIP + +----------+--------------+------------+ + |slen(name)|to_upper(name)|add_one(age)| + +----------+--------------+------------+ + | 8| JOHN DOE| 22| + +----------+--------------+------------+ + + 2. A `pandas.DataFrame` -> A `pandas.DataFrame` + + This udf is only used with :meth:`pyspark.sql.GroupedData.apply`. + The returnType should be a :class:`StructType` describing the schema of the returned + `pandas.DataFrame`. + + >>> df = spark.createDataFrame( + ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ... ("id", "v")) + >>> @pandas_udf(returnType=df.schema) + ... def normalize(pdf): + ... v = pdf.v + ... 
return pdf.assign(v=(v - v.mean()) / v.std()) + >>> df.groupby('id').apply(normalize).show() # doctest: +SKIP + +---+-------------------+ + | id| v| + +---+-------------------+ + | 1|-0.7071067811865475| + | 1| 0.7071067811865475| + | 2|-0.8320502943378437| + | 2|-0.2773500981126146| + | 2| 1.1094003924504583| + +---+-------------------+ + + .. note:: This type of udf cannot be used with functions such as `withColumn` or `select` + because it defines a `DataFrame` transformation rather than a `Column` + transformation. + + .. seealso:: :meth:`pyspark.sql.GroupedData.apply` + + .. note:: The user-defined function must be deterministic. """ return _create_udf(f, returnType=returnType, vectorized=True) diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index f2092f9c63054..817d0bc83bb77 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -54,9 +54,10 @@ class GroupedData(object): .. versionadded:: 1.3 """ - def __init__(self, jgd, sql_ctx): + def __init__(self, jgd, df): self._jgd = jgd - self.sql_ctx = sql_ctx + self._df = df + self.sql_ctx = df.sql_ctx @ignore_unicode_prefix @since(1.3) @@ -170,7 +171,7 @@ def sum(self, *cols): @since(1.6) def pivot(self, pivot_col, values=None): """ - Pivots a column of the current [[DataFrame]] and perform the specified aggregation. + Pivots a column of the current :class:`DataFrame` and perform the specified aggregation. There are two versions of pivot function: one that requires the caller to specify the list of distinct values to pivot on, and one that does not. The latter is more concise but less efficient, because Spark needs to first compute the list of distinct values internally. @@ -192,7 +193,85 @@ def pivot(self, pivot_col, values=None): jgd = self._jgd.pivot(pivot_col) else: jgd = self._jgd.pivot(pivot_col, values) - return GroupedData(jgd, self.sql_ctx) + return GroupedData(jgd, self._df) + + @since(2.3) + def apply(self, udf): + """ + Maps each group of the current :class:`DataFrame` using a pandas udf and returns the result + as a `DataFrame`. + + The user-defined function should take a `pandas.DataFrame` and return another + `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame` + to the user-function and the returned `pandas.DataFrame`s are combined as a + :class:`DataFrame`. + The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the + returnType of the pandas udf. + + This function does not support partial aggregation, and requires shuffling all the data in + the :class:`DataFrame`. + + :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf` + + >>> from pyspark.sql.functions import pandas_udf + >>> df = spark.createDataFrame( + ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ... ("id", "v")) + >>> @pandas_udf(returnType=df.schema) + ... def normalize(pdf): + ... v = pdf.v + ... return pdf.assign(v=(v - v.mean()) / v.std()) + >>> df.groupby('id').apply(normalize).show() # doctest: +SKIP + +---+-------------------+ + | id| v| + +---+-------------------+ + | 1|-0.7071067811865475| + | 1| 0.7071067811865475| + | 2|-0.8320502943378437| + | 2|-0.2773500981126146| + | 2| 1.1094003924504583| + +---+-------------------+ + + .. 
seealso:: :meth:`pyspark.sql.functions.pandas_udf` + + """ + from pyspark.sql.functions import pandas_udf + + # Columns are special because hasattr always return True + if isinstance(udf, Column) or not hasattr(udf, 'func') or not udf.vectorized: + raise ValueError("The argument to apply must be a pandas_udf") + if not isinstance(udf.returnType, StructType): + raise ValueError("The returnType of the pandas_udf must be a StructType") + + df = self._df + func = udf.func + returnType = udf.returnType + + # The python executors expects the function to use pd.Series as input and output + # So we to create a wrapper function that turns that to a pd.DataFrame before passing + # down to the user function, then turn the result pd.DataFrame back into pd.Series + columns = df.columns + + def wrapped(*cols): + from pyspark.sql.types import to_arrow_type + import pandas as pd + result = func(pd.concat(cols, axis=1, keys=columns)) + if not isinstance(result, pd.DataFrame): + raise TypeError("Return type of the user-defined function should be " + "Pandas.DataFrame, but is {}".format(type(result))) + if not len(result.columns) == len(returnType): + raise RuntimeError( + "Number of columns of the returned Pandas.DataFrame " + "doesn't match specified schema. " + "Expected: {} Actual: {}".format(len(returnType), len(result.columns))) + arrow_return_types = (to_arrow_type(field.dataType) for field in returnType) + return [(result[result.columns[i]], arrow_type) + for i, arrow_type in enumerate(arrow_return_types)] + + wrapped_udf_obj = pandas_udf(wrapped, returnType) + udf_column = wrapped_udf_obj(*[df[col] for col in df.columns]) + jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) + return DataFrame(jdf, self.sql_ctx) def _test(): @@ -206,6 +285,7 @@ def _test(): .getOrCreate() sc = spark.sparkContext globs['sc'] = sc + globs['spark'] = spark globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index a59378b5e848a..bac2ef84ae7a7 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3256,17 +3256,17 @@ def test_vectorized_udf_null_string(self): def test_vectorized_udf_zero_parameter(self): from pyspark.sql.functions import pandas_udf - error_str = '0-parameter pandas_udfs.*not.*supported' + error_str = '0-arg pandas_udfs.*not.*supported' with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, error_str): + with self.assertRaisesRegexp(ValueError, error_str): pandas_udf(lambda: 1, LongType()) - with self.assertRaisesRegexp(NotImplementedError, error_str): + with self.assertRaisesRegexp(ValueError, error_str): @pandas_udf def zero_no_type(): return 1 - with self.assertRaisesRegexp(NotImplementedError, error_str): + with self.assertRaisesRegexp(ValueError, error_str): @pandas_udf(LongType()) def zero_with_type(): return 1 @@ -3348,7 +3348,7 @@ def test_vectorized_udf_wrong_return_type(self): df = self.spark.range(10) f = pandas_udf(lambda x: x * 1.0, StringType()) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Invalid.*type.*string'): + with self.assertRaisesRegexp(Exception, 'Invalid.*type'): df.select(f(col('id'))).collect() def test_vectorized_udf_return_scalar(self): @@ -3356,7 +3356,7 @@ def test_vectorized_udf_return_scalar(self): df = self.spark.range(10) f = pandas_udf(lambda x: 1.0, DoubleType()) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 
'Return.*type.*pandas_udf.*Series'): + with self.assertRaisesRegexp(Exception, 'Return.*type.*Series'): df.select(f(col('id'))).collect() def test_vectorized_udf_decorator(self): @@ -3376,6 +3376,151 @@ def test_vectorized_udf_empty_partition(self): res = df.select(f(col('id'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_varargs(self): + from pyspark.sql.functions import pandas_udf, col + df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) + f = pandas_udf(lambda *v: v[0], LongType()) + res = df.select(f(col('id'))) + self.assertEquals(df.collect(), res.collect()) + + +@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +class GroupbyApplyTests(ReusedPySparkTestCase): + @classmethod + def setUpClass(cls): + ReusedPySparkTestCase.setUpClass() + cls.spark = SparkSession(cls.sc) + + @classmethod + def tearDownClass(cls): + ReusedPySparkTestCase.tearDownClass() + cls.spark.stop() + + def assertFramesEqual(self, expected, result): + msg = ("DataFrames are not equal: " + + ("\n\nExpected:\n%s\n%s" % (expected, expected.dtypes)) + + ("\n\nResult:\n%s\n%s" % (result, result.dtypes))) + self.assertTrue(expected.equals(result), msg=msg) + + @property + def data(self): + from pyspark.sql.functions import array, explode, col, lit + return self.spark.range(10).toDF('id') \ + .withColumn("vs", array([lit(i) for i in range(20, 30)])) \ + .withColumn("v", explode(col('vs'))).drop('vs') + + def test_simple(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + foo_udf = pandas_udf( + lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), + StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('v1', DoubleType()), + StructField('v2', LongType())])) + + result = df.groupby('id').apply(foo_udf).sort('id').toPandas() + expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) + self.assertFramesEqual(expected, result) + + def test_decorator(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + @pandas_udf(StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('v1', DoubleType()), + StructField('v2', LongType())])) + def foo(pdf): + return pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id) + + result = df.groupby('id').apply(foo).sort('id').toPandas() + expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) + self.assertFramesEqual(expected, result) + + def test_coerce(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + foo = pandas_udf( + lambda pdf: pdf, + StructType([StructField('id', LongType()), StructField('v', DoubleType())])) + + result = df.groupby('id').apply(foo).sort('id').toPandas() + expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) + expected = expected.assign(v=expected.v.astype('float64')) + self.assertFramesEqual(expected, result) + + def test_complex_groupby(self): + from pyspark.sql.functions import pandas_udf, col + df = self.data + + @pandas_udf(StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('norm', DoubleType())])) + def normalize(pdf): + v = pdf.v + return pdf.assign(norm=(v - v.mean()) / v.std()) + + result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas() + pdf = df.toPandas() + expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func) + expected = expected.sort_values(['id', 
'v']).reset_index(drop=True) + expected = expected.assign(norm=expected.norm.astype('float64')) + self.assertFramesEqual(expected, result) + + def test_empty_groupby(self): + from pyspark.sql.functions import pandas_udf, col + df = self.data + + @pandas_udf(StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('norm', DoubleType())])) + def normalize(pdf): + v = pdf.v + return pdf.assign(norm=(v - v.mean()) / v.std()) + + result = df.groupby().apply(normalize).sort('id', 'v').toPandas() + pdf = df.toPandas() + expected = normalize.func(pdf) + expected = expected.sort_values(['id', 'v']).reset_index(drop=True) + expected = expected.assign(norm=expected.norm.astype('float64')) + self.assertFramesEqual(expected, result) + + def test_wrong_return_type(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + foo = pandas_udf( + lambda pdf: pdf, + StructType([StructField('id', LongType()), StructField('v', StringType())])) + + with QuietTest(self.sc): + with self.assertRaisesRegexp(Exception, 'Invalid.*type'): + df.groupby('id').apply(foo).sort('id').toPandas() + + def test_wrong_args(self): + from pyspark.sql.functions import udf, pandas_udf, sum + df = self.data + + with QuietTest(self.sc): + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply(lambda x: x) + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply(udf(lambda x: x, DoubleType())) + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply(sum(df.v)) + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply(df.v + 1) + with self.assertRaisesRegexp(ValueError, 'returnType'): + df.groupby('id').apply(pandas_udf(lambda x: x, DoubleType())) + + if __name__ == "__main__": from pyspark.sql.tests import * if xmlrunner: diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index ebdc11c3b744a..f65273d5f0b6c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1597,7 +1597,7 @@ def convert(self, obj, gateway_client): register_input_converter(DateConverter()) -def toArrowType(dt): +def to_arrow_type(dt): """ Convert Spark data type to pyarrow type """ import pyarrow as pa diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 4e24789cf010d..eb6d48688dc0a 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -32,7 +32,7 @@ from pyspark.serializers import write_with_length, write_int, read_long, \ write_long, read_int, SpecialLengths, PythonEvalType, UTF8Deserializer, PickleSerializer, \ BatchedSerializer, ArrowStreamPandasSerializer -from pyspark.sql.types import toArrowType +from pyspark.sql.types import to_arrow_type, StructType from pyspark import shuffle pickleSer = PickleSerializer() @@ -74,17 +74,28 @@ def wrap_udf(f, return_type): def wrap_pandas_udf(f, return_type): - arrow_return_type = toArrowType(return_type) - - def verify_result_length(*a): - result = f(*a) - if not hasattr(result, "__len__"): - raise TypeError("Return type of pandas_udf should be a Pandas.Series") - if len(result) != len(a[0]): - raise RuntimeError("Result vector from pandas_udf was not the required length: " - "expected %d, got %d" % (len(a[0]), len(result))) - return result - return lambda *a: (verify_result_length(*a), arrow_return_type) + # If the return_type is a StructType, it indicates this is a groupby apply udf, + # and has already been wrapped under apply(), otherwise, it's a vectorized column udf. 
+ # We can distinguish these two by return type because in groupby apply, we always specify + # returnType as a StructType, and in vectorized column udf, StructType is not supported. + # + # TODO: Look into refactoring use of StructType to be more flexible for future pandas_udfs + if isinstance(return_type, StructType): + return lambda *a: f(*a) + else: + arrow_return_type = to_arrow_type(return_type) + + def verify_result_length(*a): + result = f(*a) + if not hasattr(result, "__len__"): + raise TypeError("Return type of the user-defined functon should be " + "Pandas.Series, but is {}".format(type(result))) + if len(result) != len(a[0]): + raise RuntimeError("Result vector from pandas_udf was not the required length: " + "expected %d, got %d" % (len(a[0]), len(result))) + return result + + return lambda *a: (verify_result_length(*a), arrow_return_type) def read_single_udf(pickleSer, infile, eval_type): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index bc2d4a824cb49..d829e01441dcc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -452,6 +452,8 @@ object ColumnPruning extends Rule[LogicalPlan] { // Prunes the unused columns from child of Aggregate/Expand/Generate case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = prunedChild(child, a.references)) + case f @ FlatMapGroupsInPandas(_, _, _, child) if (child.outputSet -- f.references).nonEmpty => + f.copy(child = prunedChild(child, f.references)) case e @ Expand(_, _, child) if (child.outputSet -- e.references).nonEmpty => e.copy(child = prunedChild(child, e.references)) case g: Generate if !g.join && (g.child.outputSet -- g.references).nonEmpty => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala new file mode 100644 index 0000000000000..8abab24bc9b44 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression} + +/** + * FlatMap groups using an udf: pandas.Dataframe -> pandas.DataFrame. + * This is used by DataFrame.groupby().apply(). 
+ */ +case class FlatMapGroupsInPandas( + groupingAttributes: Seq[Attribute], + functionExpr: Expression, + output: Seq[Attribute], + child: LogicalPlan) extends UnaryNode { + /** + * This is needed because output attributes are considered `references` when + * passed through the constructor. + * + * Without this, catalyst will complain that output attributes are missing + * from the input. + */ + override val producedAttributes = AttributeSet(output) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 147b549964913..cd0ac1feffa51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -27,12 +27,12 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, FlatMapGroupsInR, Pivot} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression +import org.apache.spark.sql.execution.python.PythonUDF import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.NumericType -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{NumericType, StructField, StructType} /** * A set of methods for aggregations on a `DataFrame`, created by [[Dataset#groupBy groupBy]], @@ -435,6 +435,36 @@ class RelationalGroupedDataset protected[sql]( df.logicalPlan.output, df.logicalPlan)) } + + /** + * Applies a vectorized python user-defined function to each group of data. + * The user-defined function defines a transformation: `pandas.DataFrame` -> `pandas.DataFrame`. + * For each group, all elements in the group are passed as a `pandas.DataFrame` and the results + * for all groups are combined into a new [[DataFrame]]. + * + * This function does not support partial aggregation, and requires shuffling all the data in + * the [[DataFrame]]. + * + * This function uses Apache Arrow as serialization format between Java executors and Python + * workers. 
+ */ + private[sql] def flatMapGroupsInPandas(expr: PythonUDF): DataFrame = { + require(expr.vectorized, "Must pass a vectorized python udf") + require(expr.dataType.isInstanceOf[StructType], + "The returnType of the vectorized python udf must be a StructType") + + val groupingNamedExpressions = groupingExprs.map { + case ne: NamedExpression => ne + case other => Alias(other, other.toString)() + } + val groupingAttributes = groupingNamedExpressions.map(_.toAttribute) + val child = df.logicalPlan + val project = Project(groupingNamedExpressions ++ child.output, child) + val output = expr.dataType.asInstanceOf[StructType].toAttributes + val plan = FlatMapGroupsInPandas(groupingAttributes, expr, output, project) + + Dataset.ofRows(df.sparkSession, plan) + } } private[sql] object RelationalGroupedDataset { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 92eaab5cd8f81..4cdcc73faacd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -392,6 +392,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.FlatMapGroupsInR(f, p, b, is, os, key, value, grouping, data, objAttr, child) => execution.FlatMapGroupsInRExec(f, p, b, is, os, key, value, grouping, data, objAttr, planLater(child)) :: Nil + case logical.FlatMapGroupsInPandas(grouping, func, output, child) => + execution.python.FlatMapGroupsInPandasExec(grouping, func, output, planLater(child)) :: Nil case logical.MapElements(f, _, _, objAttr, child) => execution.MapElementsExec(f, objAttr, planLater(child)) :: Nil case logical.AppendColumns(f, _, _, in, out, child) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index f7e8cbe416121..81896187ecc46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -26,6 +26,35 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.StructType +/** + * Grouped a iterator into batches. + * This is similar to iter.grouped but returns Iterator[T] instead of Seq[T]. + * This is necessary because sometimes we cannot hold reference of input rows + * because the some input rows are mutable and can be reused. + */ +private class BatchIterator[T](iter: Iterator[T], batchSize: Int) + extends Iterator[Iterator[T]] { + + override def hasNext: Boolean = iter.hasNext + + override def next(): Iterator[T] = { + new Iterator[T] { + var count = 0 + + override def hasNext: Boolean = iter.hasNext && count < batchSize + + override def next(): T = { + if (!hasNext) { + Iterator.empty.next() + } else { + count += 1 + iter.next() + } + } + } + } +} + /** * A physical plan that evaluates a [[PythonUDF]], */ @@ -44,14 +73,18 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex .map { case (attr, i) => attr.withName(s"_$i") }) + val batchSize = conf.arrowMaxRecordsPerBatch + // DO NOT use iter.grouped(). See BatchIterator. 
+ val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) + val columnarBatchIter = new ArrowPythonRunner( - funcs, conf.arrowMaxRecordsPerBatch, bufferSize, reuseWorker, + funcs, bufferSize, reuseWorker, PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema) - .compute(iter, context.partitionId(), context) + .compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { - var currentIter = if (columnarBatchIter.hasNext) { + private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() assert(schemaOut.equals(batch.schema), s"Invalid schema from pandas_udf: expected $schemaOut, got ${batch.schema}") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index bbad9d6b631fd..f6c03c415dc66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -39,19 +39,18 @@ import org.apache.spark.util.Utils */ class ArrowPythonRunner( funcs: Seq[ChainedPythonFunctions], - batchSize: Int, bufferSize: Int, reuseWorker: Boolean, evalType: Int, argOffsets: Array[Array[Int]], schema: StructType) - extends BasePythonRunner[InternalRow, ColumnarBatch]( + extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch]( funcs, bufferSize, reuseWorker, evalType, argOffsets) { protected override def newWriterThread( env: SparkEnv, worker: Socket, - inputIterator: Iterator[InternalRow], + inputIterator: Iterator[Iterator[InternalRow]], partitionIndex: Int, context: TaskContext): WriterThread = { new WriterThread(env, worker, inputIterator, partitionIndex, context) { @@ -82,12 +81,12 @@ class ArrowPythonRunner( Utils.tryWithSafeFinally { while (inputIterator.hasNext) { - var rowCount = 0 - while (inputIterator.hasNext && (batchSize <= 0 || rowCount < batchSize)) { - val row = inputIterator.next() - arrowWriter.write(row) - rowCount += 1 + val nextBatch = inputIterator.next() + + while (nextBatch.hasNext) { + arrowWriter.write(nextBatch.next()) } + arrowWriter.finish() writer.writeBatch() arrowWriter.reset() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index fec456d86dbe2..e3f952e221d53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -24,8 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution -import org.apache.spark.sql.execution.{FilterExec, SparkPlan} +import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} /** @@ -111,6 +110,9 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper { } def apply(plan: SparkPlan): SparkPlan = plan transformUp { + // FlatMapGroupsInPandas can be evaluated directly in python worker + // Therefore we don't need to extract the UDFs + case plan: FlatMapGroupsInPandasExec => plan case plan: SparkPlan => extract(plan) } @@ -169,7 +171,7 @@ object ExtractPythonUDFs extends 
Rule[SparkPlan] with PredicateHelper { val newPlan = extract(rewritten) if (newPlan.output != plan.output) { // Trim away the new UDF value if it was only used for filtering or something. - execution.ProjectExec(plan.output, newPlan) + ProjectExec(plan.output, newPlan) } else { newPlan } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala new file mode 100644 index 0000000000000..b996b5bb38ba5 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.python + +import scala.collection.JavaConverters._ + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.types.StructType + +/** + * Physical node for [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandas]] + * + * Rows in each group are passed to the Python worker as an Arrow record batch. + * The Python worker turns the record batch to a `pandas.DataFrame`, invoke the + * user-defined function, and passes the resulting `pandas.DataFrame` + * as an Arrow record batch. Finally, each record batch is turned to + * Iterator[InternalRow] using ColumnarBatch. + * + * Note on memory usage: + * Both the Python worker and the Java executor need to have enough memory to + * hold the largest group. The memory on the Java side is used to construct the + * record batch (off heap memory). The memory on the Python side is used for + * holding the `pandas.DataFrame`. It's possible to further split one group into + * multiple record batches to reduce the memory footprint on the Java side, this + * is left as future work. 
+ */ +case class FlatMapGroupsInPandasExec( + groupingAttributes: Seq[Attribute], + func: Expression, + output: Seq[Attribute], + child: SparkPlan) + extends UnaryExecNode { + + private val pandasFunction = func.asInstanceOf[PythonUDF].func + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def producedAttributes: AttributeSet = AttributeSet(output) + + override def requiredChildDistribution: Seq[Distribution] = { + if (groupingAttributes.isEmpty) { + AllTuples :: Nil + } else { + ClusteredDistribution(groupingAttributes) :: Nil + } + } + + override def requiredChildOrdering: Seq[Seq[SortOrder]] = + Seq(groupingAttributes.map(SortOrder(_, Ascending))) + + override protected def doExecute(): RDD[InternalRow] = { + val inputRDD = child.execute() + + val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536) + val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true) + val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) + val argOffsets = Array((0 until (child.output.length - groupingAttributes.length)).toArray) + val schema = StructType(child.schema.drop(groupingAttributes.length)) + + inputRDD.mapPartitionsInternal { iter => + val grouped = if (groupingAttributes.isEmpty) { + Iterator(iter) + } else { + val groupedIter = GroupedIterator(iter, groupingAttributes, child.output) + val dropGrouping = + UnsafeProjection.create(child.output.drop(groupingAttributes.length), child.output) + groupedIter.map { + case (_, groupedRowIter) => groupedRowIter.map(dropGrouping) + } + } + + val context = TaskContext.get() + + val columnarBatchIter = new ArrowPythonRunner( + chainedFunc, bufferSize, reuseWorker, + PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema) + .compute(grouped, context.partitionId(), context) + + columnarBatchIter.flatMap(_.rowIterator.asScala).map(UnsafeProjection.create(output, output)) + } + } +} From bd4eb9ce57da7bacff69d9ed958c94f349b7e6fb Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 10 Oct 2017 15:50:37 -0700 Subject: [PATCH 693/779] [SPARK-19558][SQL] Add config key to register QueryExecutionListeners automatically. This change adds a new SQL config key that is equivalent to SparkContext's "spark.extraListeners", allowing users to register QueryExecutionListener instances through the Spark configuration system instead of having to explicitly do it in code. The code used by SparkContext to implement the feature was refactored into a helper method in the Utils class, and SQL's ExecutionListenerManager was modified to use it to initialize listener declared in the configuration. Unit tests were added to verify all the new functionality. Author: Marcelo Vanzin Closes #19309 from vanzin/SPARK-19558. 
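As a usage sketch (hedged: the listener class and the small driver program below are illustrative; only the `spark.sql.queryExecutionListeners` key and the no-arg/SparkConf constructor contract come from this change), a QueryExecutionListener can now be wired up purely through configuration:

```
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

// A listener using the 1-arg SparkConf constructor supported by
// Utils.loadExtensions; a plain no-arg constructor would work as well.
class LatencyLoggingListener(conf: SparkConf) extends QueryExecutionListener {
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
    println(s"$funcName finished in ${durationNs / 1e6} ms")

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit =
    println(s"$funcName failed: ${exception.getMessage}")
}

object ListenerFromConfExample {
  def main(args: Array[String]): Unit = {
    // The new key is a static SQL conf, so it has to be set before the first
    // SparkSession is created; every new session then loads the listener
    // automatically, with no explicit listenerManager.register(...) call.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("query-execution-listener-from-conf")
      .config("spark.sql.queryExecutionListeners", classOf[LatencyLoggingListener].getName)
      .getOrCreate()

    spark.range(1000).count() // an action, so the listener is notified
    spark.stop()
  }
}
```

With this, a packaged listener (for example, one used for audit logging) can be enabled per deployment without touching application code, mirroring what `spark.extraListeners` already does for SparkListeners.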
--- .../scala/org/apache/spark/SparkContext.scala | 38 ++-------- .../spark/internal/config/package.scala | 7 ++ .../scala/org/apache/spark/util/Utils.scala | 57 ++++++++++++++- .../spark/scheduler/SparkListenerSuite.scala | 6 +- .../org/apache/spark/util/UtilsSuite.scala | 56 ++++++++++++++- .../spark/sql/internal/StaticSQLConf.scala | 8 +++ .../internal/BaseSessionStateBuilder.scala | 3 +- .../sql/util/QueryExecutionListener.scala | 12 +++- .../util/ExecutionListenerManagerSuite.scala | 69 +++++++++++++++++++ 9 files changed, 216 insertions(+), 40 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b3cd03c0cfbe1..6f25d346e6e54 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2344,41 +2344,13 @@ class SparkContext(config: SparkConf) extends Logging { * (e.g. after the web UI and event logging listeners have been registered). */ private def setupAndStartListenerBus(): Unit = { - // Use reflection to instantiate listeners specified via `spark.extraListeners` try { - val listenerClassNames: Seq[String] = - conf.get("spark.extraListeners", "").split(',').map(_.trim).filter(_ != "") - for (className <- listenerClassNames) { - // Use reflection to find the right constructor - val constructors = { - val listenerClass = Utils.classForName(className) - listenerClass - .getConstructors - .asInstanceOf[Array[Constructor[_ <: SparkListenerInterface]]] + conf.get(EXTRA_LISTENERS).foreach { classNames => + val listeners = Utils.loadExtensions(classOf[SparkListenerInterface], classNames, conf) + listeners.foreach { listener => + listenerBus.addToSharedQueue(listener) + logInfo(s"Registered listener ${listener.getClass().getName()}") } - val constructorTakingSparkConf = constructors.find { c => - c.getParameterTypes.sameElements(Array(classOf[SparkConf])) - } - lazy val zeroArgumentConstructor = constructors.find { c => - c.getParameterTypes.isEmpty - } - val listener: SparkListenerInterface = { - if (constructorTakingSparkConf.isDefined) { - constructorTakingSparkConf.get.newInstance(conf) - } else if (zeroArgumentConstructor.isDefined) { - zeroArgumentConstructor.get.newInstance() - } else { - throw new SparkException( - s"$className did not have a zero-argument constructor or a" + - " single-argument constructor that accepts SparkConf. 
Note: if the class is" + - " defined inside of another Scala class, then its constructors may accept an" + - " implicit parameter that references the enclosing class; in this case, you must" + - " define the listener as a top-level class in order to prevent this extra" + - " parameter from breaking Spark's ability to find a valid constructor.") - } - } - listenerBus.addToSharedQueue(listener) - logInfo(s"Registered listener $className") } } catch { case e: Exception => diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 5278e5e0fb270..19336f854145f 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -419,4 +419,11 @@ package object config { .stringConf .toSequence .createWithDefault(Nil) + + private[spark] val EXTRA_LISTENERS = ConfigBuilder("spark.extraListeners") + .doc("Class names of listeners to add to SparkContext during initialization.") + .stringConf + .toSequence + .createOptional + } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 836e33c36d9a1..930e09d90c2f5 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -19,6 +19,7 @@ package org.apache.spark.util import java.io._ import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, ThreadInfo} +import java.lang.reflect.InvocationTargetException import java.math.{MathContext, RoundingMode} import java.net._ import java.nio.ByteBuffer @@ -37,7 +38,7 @@ import scala.collection.Map import scala.collection.mutable.ArrayBuffer import scala.io.Source import scala.reflect.ClassTag -import scala.util.Try +import scala.util.{Failure, Success, Try} import scala.util.control.{ControlThrowable, NonFatal} import scala.util.matching.Regex @@ -2687,6 +2688,60 @@ private[spark] object Utils extends Logging { def stringToSeq(str: String): Seq[String] = { str.split(",").map(_.trim()).filter(_.nonEmpty) } + + /** + * Create instances of extension classes. + * + * The classes in the given list must: + * - Be sub-classes of the given base class. + * - Provide either a no-arg constructor, or a 1-arg constructor that takes a SparkConf. + * + * The constructors are allowed to throw "UnsupportedOperationException" if the extension does not + * want to be registered; this allows the implementations to check the Spark configuration (or + * other state) and decide they do not need to be added. A log message is printed in that case. + * Other exceptions are bubbled up. + */ + def loadExtensions[T](extClass: Class[T], classes: Seq[String], conf: SparkConf): Seq[T] = { + classes.flatMap { name => + try { + val klass = classForName(name) + require(extClass.isAssignableFrom(klass), + s"$name is not a subclass of ${extClass.getName()}.") + + val ext = Try(klass.getConstructor(classOf[SparkConf])) match { + case Success(ctor) => + ctor.newInstance(conf) + + case Failure(_) => + klass.getConstructor().newInstance() + } + + Some(ext.asInstanceOf[T]) + } catch { + case _: NoSuchMethodException => + throw new SparkException( + s"$name did not have a zero-argument constructor or a" + + " single-argument constructor that accepts SparkConf. 
Note: if the class is" + + " defined inside of another Scala class, then its constructors may accept an" + + " implicit parameter that references the enclosing class; in this case, you must" + + " define the class as a top-level class in order to prevent this extra" + + " parameter from breaking Spark's ability to find a valid constructor.") + + case e: InvocationTargetException => + e.getCause() match { + case uoe: UnsupportedOperationException => + logDebug(s"Extension $name not being initialized.", uoe) + logInfo(s"Extension $name not being initialized.") + None + + case null => throw e + + case cause => throw cause + } + } + } + } + } private[util] object CallerContext extends Logging { diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index d061c7845f4a6..1beb36afa95f0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -27,7 +27,7 @@ import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.executor.TaskMetrics -import org.apache.spark.internal.config.LISTENER_BUS_EVENT_QUEUE_CAPACITY +import org.apache.spark.internal.config._ import org.apache.spark.metrics.MetricsSystem import org.apache.spark.util.{ResetSystemProperties, RpcUtils} @@ -446,13 +446,13 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match classOf[FirehoseListenerThatAcceptsSparkConf], classOf[BasicJobCounter]) val conf = new SparkConf().setMaster("local").setAppName("test") - .set("spark.extraListeners", listeners.map(_.getName).mkString(",")) + .set(EXTRA_LISTENERS, listeners.map(_.getName)) sc = new SparkContext(conf) sc.listenerBus.listeners.asScala.count(_.isInstanceOf[BasicJobCounter]) should be (1) sc.listenerBus.listeners.asScala .count(_.isInstanceOf[ListenerThatAcceptsSparkConf]) should be (1) sc.listenerBus.listeners.asScala - .count(_.isInstanceOf[FirehoseListenerThatAcceptsSparkConf]) should be (1) + .count(_.isInstanceOf[FirehoseListenerThatAcceptsSparkConf]) should be (1) } test("add and remove listeners to/from LiveListenerBus queues") { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 2b16cc4852ba8..4d3adeb968e84 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -38,9 +38,10 @@ import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext} +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.network.util.ByteUnit +import org.apache.spark.scheduler.SparkListener class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { @@ -1110,4 +1111,57 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { Utils.tryWithSafeFinallyAndFailureCallbacks {}(catchBlock = {}, finallyBlock = {}) TaskContext.unset } + + test("load extensions") { + val extensions = Seq( + classOf[SimpleExtension], + classOf[ExtensionWithConf], + classOf[UnregisterableExtension]).map(_.getName()) + + val conf = new SparkConf(false) + val instances = Utils.loadExtensions(classOf[Object], extensions, conf) + 
assert(instances.size === 2) + assert(instances.count(_.isInstanceOf[SimpleExtension]) === 1) + + val extWithConf = instances.find(_.isInstanceOf[ExtensionWithConf]) + .map(_.asInstanceOf[ExtensionWithConf]) + .get + assert(extWithConf.conf eq conf) + + class NestedExtension { } + + val invalid = Seq(classOf[NestedExtension].getName()) + intercept[SparkException] { + Utils.loadExtensions(classOf[Object], invalid, conf) + } + + val error = Seq(classOf[ExtensionWithError].getName()) + intercept[IllegalArgumentException] { + Utils.loadExtensions(classOf[Object], error, conf) + } + + val wrongType = Seq(classOf[ListenerImpl].getName()) + intercept[IllegalArgumentException] { + Utils.loadExtensions(classOf[Seq[_]], wrongType, conf) + } + } + +} + +private class SimpleExtension + +private class ExtensionWithConf(val conf: SparkConf) + +private class UnregisterableExtension { + + throw new UnsupportedOperationException() + +} + +private class ExtensionWithError { + + throw new IllegalArgumentException() + } + +private class ListenerImpl extends SparkListener diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index c6c0a605d89ff..c018fc8a332fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -87,4 +87,12 @@ object StaticSQLConf { "implement Function1[SparkSessionExtension, Unit], and must have a no-args constructor.") .stringConf .createOptional + + val QUERY_EXECUTION_LISTENERS = buildStaticConf("spark.sql.queryExecutionListeners") + .doc("List of class names implementing QueryExecutionListener that will be automatically " + + "added to newly created sessions. The classes should have either a no-arg constructor, " + + "or a constructor that expects a SparkConf argument.") + .stringConf + .toSequence + .createOptional } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 4e756084bbdbb..2867b4cd7da5e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -266,7 +266,8 @@ abstract class BaseSessionStateBuilder( * This gets cloned from parent if available, otherwise is a new instance is created. 
*/ protected def listenerManager: ExecutionListenerManager = { - parentState.map(_.listenerManager.clone()).getOrElse(new ExecutionListenerManager) + parentState.map(_.listenerManager.clone()).getOrElse( + new ExecutionListenerManager(session.sparkContext.conf)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index f6240d85fba6f..2b46233e1a5df 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -22,9 +22,12 @@ import java.util.concurrent.locks.ReentrantReadWriteLock import scala.collection.mutable.ListBuffer import scala.util.control.NonFatal +import org.apache.spark.SparkConf import org.apache.spark.annotation.{DeveloperApi, Experimental, InterfaceStability} import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.internal.StaticSQLConf._ +import org.apache.spark.util.Utils /** * :: Experimental :: @@ -72,7 +75,14 @@ trait QueryExecutionListener { */ @Experimental @InterfaceStability.Evolving -class ExecutionListenerManager private[sql] () extends Logging { +class ExecutionListenerManager private extends Logging { + + private[sql] def this(conf: SparkConf) = { + this() + conf.get(QUERY_EXECUTION_LISTENERS).foreach { classNames => + Utils.loadExtensions(classOf[QueryExecutionListener], classNames, conf).foreach(register) + } + } /** * Registers the specified [[QueryExecutionListener]]. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala new file mode 100644 index 0000000000000..4205e23ae240a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.util + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark._ +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.internal.StaticSQLConf._ + +class ExecutionListenerManagerSuite extends SparkFunSuite { + + import CountingQueryExecutionListener._ + + test("register query execution listeners using configuration") { + val conf = new SparkConf(false) + .set(QUERY_EXECUTION_LISTENERS, Seq(classOf[CountingQueryExecutionListener].getName())) + + val mgr = new ExecutionListenerManager(conf) + assert(INSTANCE_COUNT.get() === 1) + mgr.onSuccess(null, null, 42L) + assert(CALLBACK_COUNT.get() === 1) + + val clone = mgr.clone() + assert(INSTANCE_COUNT.get() === 1) + + clone.onSuccess(null, null, 42L) + assert(CALLBACK_COUNT.get() === 2) + } + +} + +private class CountingQueryExecutionListener extends QueryExecutionListener { + + import CountingQueryExecutionListener._ + + INSTANCE_COUNT.incrementAndGet() + + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { + CALLBACK_COUNT.incrementAndGet() + } + + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + CALLBACK_COUNT.incrementAndGet() + } + +} + +private object CountingQueryExecutionListener { + + val CALLBACK_COUNT = new AtomicInteger() + val INSTANCE_COUNT = new AtomicInteger() + +} From 76fb173dd639baa9534486488155fc05a71f850e Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 10 Oct 2017 20:29:02 -0700 Subject: [PATCH 694/779] [SPARK-21751][SQL] CodeGeneraor.splitExpressions counts code size more precisely ## What changes were proposed in this pull request? Current `CodeGeneraor.splitExpressions` splits statements into methods if the total length of statements is more than 1024 characters. The length may include comments or empty line. This PR excludes comment or empty line from the length to reduce the number of generated methods in a class, by using `CodeFormatter.stripExtraNewLinesAndComments()` method. ## How was this patch tested? Existing tests Author: Kazuaki Ishizaki Closes #18966 from kiszk/SPARK-21751. 
--- .../expressions/codegen/CodeFormatter.scala | 8 +++++ .../expressions/codegen/CodeGenerator.scala | 5 ++- .../codegen/CodeFormatterSuite.scala | 32 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala index 60e600d8dbd8f..7b398f424cead 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala @@ -89,6 +89,14 @@ object CodeFormatter { } new CodeAndComment(code.result().trim(), map) } + + def stripExtraNewLinesAndComments(input: String): String = { + val commentReg = + ("""([ |\t]*?\/\*[\s|\S]*?\*\/[ |\t]*?)|""" + // strip /*comment*/ + """([ |\t]*?\/\/[\s\S]*?\n)""").r // strip //comment + val codeWithoutComment = commentReg.replaceAllIn(input, "") + codeWithoutComment.replaceAll("""\n\s*\n""", "\n") // strip ExtraNewLines + } } private class CodeFormatter { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index f9c5ef8439085..2cb66599076a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -772,16 +772,19 @@ class CodegenContext { foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = { val blocks = new ArrayBuffer[String]() val blockBuilder = new StringBuilder() + var length = 0 for (code <- expressions) { // We can't know how many bytecode will be generated, so use the length of source code // as metric. A method should not go beyond 8K, otherwise it will not be JITted, should // also not be too small, or it will have many function calls (for wide table), see the // results in BenchmarkWideTable. 
- if (blockBuilder.length > 1024) { + if (length > 1024) { blocks += blockBuilder.toString() blockBuilder.clear() + length = 0 } blockBuilder.append(code) + length += CodeFormatter.stripExtraNewLinesAndComments(code).length } blocks += blockBuilder.toString() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala index 9d0a41661beaa..a0f1a64b0ab08 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala @@ -53,6 +53,38 @@ class CodeFormatterSuite extends SparkFunSuite { assert(reducedCode.body === "/*project_c4*/") } + test("removing extra new lines and comments") { + val code = + """ + |/* + | * multi + | * line + | * comments + | */ + | + |public function() { + |/*comment*/ + | /*comment_with_space*/ + |code_body + |//comment + |code_body + | //comment_with_space + | + |code_body + |} + """.stripMargin + + val reducedCode = CodeFormatter.stripExtraNewLinesAndComments(code) + assert(reducedCode === + """ + |public function() { + |code_body + |code_body + |code_body + |} + """.stripMargin) + } + testCase("basic example") { """ |class A { From 655f6f86f84ff5241d1d20766e1ef83bb32ca5e0 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Wed, 11 Oct 2017 00:16:12 -0700 Subject: [PATCH 695/779] [SPARK-22208][SQL] Improve percentile_approx by not rounding up targetError and starting from index 0 ## What changes were proposed in this pull request? Currently percentile_approx never returns the first element when percentile is in (relativeError, 1/N], where relativeError default 1/10000, and N is the total number of elements. But ideally, percentiles in [0, 1/N] should all return the first element as the answer. For example, given input data 1 to 10, if a user queries 10% (or even less) percentile, it should return 1, because the first value 1 already reaches 10%. Currently it returns 2. Based on the paper, targetError is not rounded up, and searching index should start from 0 instead of 1. By following the paper, we should be able to fix the cases mentioned above. ## How was this patch tested? Added a new test case and fix existing test cases. Author: Zhenhua Wang Closes #19438 from wzhfy/improve_percentile_approx. 
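To make the intended behaviour concrete, here is a tiny exact-quantile sketch in Scala (illustrative only, not the patched `QuantileSummaries` code; the object and method names are invented). For the values 1 to 10 it shows why any percentage at or below 10% should yield the first element, which is exactly what the new `percentile_approx` test added in this patch asserts.

object ExactPercentileSketch {
  // Exact percentile over sorted data: take the element whose 1-based rank is ceil(p * n).
  def exactPercentile(sorted: IndexedSeq[Int], p: Double): Int = {
    val rank = math.max(1, math.ceil(p * sorted.length).toInt)
    sorted(rank - 1)
  }

  def main(args: Array[String]): Unit = {
    val data = (1 to 10).toIndexedSeq
    println(exactPercentile(data, 0.01)) // 1: rank ceil(0.1) = 1
    println(exactPercentile(data, 0.10)) // 1: rank ceil(1.0) = 1
    println(exactPercentile(data, 0.11)) // 2: rank ceil(1.1) = 2
  }
}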
--- R/pkg/tests/fulltests/test_sparkSQL.R | 8 ++++---- .../apache/spark/ml/feature/ImputerSuite.scala | 2 +- python/pyspark/sql/dataframe.py | 6 +++--- .../sql/catalyst/util/QuantileSummaries.scala | 4 ++-- .../catalyst/util/QuantileSummariesSuite.scala | 10 ++++++++-- .../sql/ApproximatePercentileQuerySuite.scala | 17 ++++++++++++++++- .../apache/spark/sql/DataFrameStatSuite.scala | 2 +- .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- 8 files changed, 36 insertions(+), 15 deletions(-) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index bbea25bc4da5c..4382ef2ed4525 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2538,7 +2538,7 @@ test_that("describe() and summary() on a DataFrame", { stats2 <- summary(df) expect_equal(collect(stats2)[5, "summary"], "25%") - expect_equal(collect(stats2)[5, "age"], "30") + expect_equal(collect(stats2)[5, "age"], "19") stats3 <- summary(df, "min", "max", "55.1%") @@ -2738,7 +2738,7 @@ test_that("sampleBy() on a DataFrame", { }) test_that("approxQuantile() on a DataFrame", { - l <- lapply(c(0:99), function(i) { list(i, 99 - i) }) + l <- lapply(c(0:100), function(i) { list(i, 100 - i) }) df <- createDataFrame(l, list("a", "b")) quantiles <- approxQuantile(df, "a", c(0.5, 0.8), 0.0) expect_equal(quantiles, list(50, 80)) @@ -2749,8 +2749,8 @@ test_that("approxQuantile() on a DataFrame", { dfWithNA <- createDataFrame(data.frame(a = c(NA, 30, 19, 11, 28, 15), b = c(-30, -19, NA, -11, -28, -15))) quantiles3 <- approxQuantile(dfWithNA, c("a", "b"), c(0.5), 0.0) - expect_equal(quantiles3[[1]], list(28)) - expect_equal(quantiles3[[2]], list(-15)) + expect_equal(quantiles3[[1]], list(19)) + expect_equal(quantiles3[[2]], list(-19)) }) test_that("SQL error message is returned from JVM", { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index ee2ba73fa96d5..c08b35b419266 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -43,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), - (3, -1.0, 2.0, 3.0) + (3, -1.0, 2.0, 1.0) )).toDF("id", "value", "expected_mean_value", "expected_median_value") val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1.0) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 2d596229ced7e..38b01f0011671 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1038,8 +1038,8 @@ def summary(self, *statistics): | mean| 3.5| null| | stddev|2.1213203435596424| null| | min| 2|Alice| - | 25%| 5| null| - | 50%| 5| null| + | 25%| 2| null| + | 50%| 2| null| | 75%| 5| null| | max| 5| Bob| +-------+------------------+-----+ @@ -1050,7 +1050,7 @@ def summary(self, *statistics): +-------+---+-----+ | count| 2| 2| | min| 2|Alice| - | 25%| 5| null| + | 25%| 2| null| | 75%| 5| null| | max| 5| Bob| +-------+---+-----+ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index af543b04ba780..eb7941cf9e6af 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -193,10 +193,10 @@ class QuantileSummaries( // Target rank val rank = math.ceil(quantile * count).toInt - val targetError = math.ceil(relativeError * count) + val targetError = relativeError * count // Minimum rank at current sample var minRank = 0 - var i = 1 + var i = 0 while (i < sampled.length - 1) { val curSample = sampled(i) minRank += curSample.g diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala index df579d5ec1ddf..650813975d75c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala @@ -57,8 +57,14 @@ class QuantileSummariesSuite extends SparkFunSuite { private def checkQuantile(quant: Double, data: Seq[Double], summary: QuantileSummaries): Unit = { if (data.nonEmpty) { val approx = summary.query(quant).get - // The rank of the approximation. - val rank = data.count(_ < approx) // has to be <, not <= to be exact + // Get the rank of the approximation. + val rankOfValue = data.count(_ <= approx) + val rankOfPreValue = data.count(_ < approx) + // `rankOfValue` is the last position of the quantile value. If the input repeats the value + // chosen as the quantile, e.g. in (1,2,2,2,2,2,3), the 50% quantile is 2, then it's + // improper to choose the last position as its rank. Instead, we get the rank by averaging + // `rankOfValue` and `rankOfPreValue`. + val rank = math.ceil((rankOfValue + rankOfPreValue) / 2.0) val lower = math.floor((quant - summary.relativeError) * data.size) val upper = math.ceil((quant + summary.relativeError) * data.size) val msg = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 1aea33766407f..137c5bea2abb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -53,6 +53,21 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext { } } + test("percentile_approx, the first element satisfies small percentages") { + withTempView(table) { + (1 to 10).toDF("col").createOrReplaceTempView(table) + checkAnswer( + spark.sql( + s""" + |SELECT + | percentile_approx(col, array(0.01, 0.1, 0.11)) + |FROM $table + """.stripMargin), + Row(Seq(1, 1, 2)) + ) + } + } + test("percentile_approx, array of percentile value") { withTempView(table) { (1 to 1000).toDF("col").createOrReplaceTempView(table) @@ -130,7 +145,7 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext { (1 to 1000).toDF("col").createOrReplaceTempView(table) checkAnswer( spark.sql(s"SELECT percentile_approx(col, array(0.25 + 0.25D), 200 + 800D) FROM $table"), - Row(Seq(500D)) + Row(Seq(499)) ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 247c30e2ee65b..46b21c3b64a2e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -141,7 +141,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { test("approximate quantile") { val n = 1000 - val df = Seq.tabulate(n)(i => (i, 2.0 * i)).toDF("singles", "doubles") + val df = Seq.tabulate(n + 1)(i => (i, 2.0 * i)).toDF("singles", "doubles") val q1 = 0.5 val q2 = 0.8 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index dd8f54b690f64..ad461fa6144b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -855,7 +855,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Row("mean", null, "33.0", "178.0"), Row("stddev", null, "19.148542155126762", "11.547005383792516"), Row("min", "Alice", "16", "164"), - Row("25%", null, "24", "176"), + Row("25%", null, "16", "164"), Row("50%", null, "24", "176"), Row("75%", null, "32", "180"), Row("max", "David", "60", "192")) From 645e108eeb6364e57f5d7213dbbd42dbcf1124d3 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 11 Oct 2017 13:51:33 -0700 Subject: [PATCH 696/779] [SPARK-21988][SS] Implement StreamingRelation.computeStats to fix explain ## What changes were proposed in this pull request? Implement StreamingRelation.computeStats to fix explain ## How was this patch tested? - unit tests: `StreamingRelation.computeStats` and `StreamingExecutionRelation.computeStats`. - regression tests: `explain join with a normal source` and `explain join with MemoryStream`. Author: Shixiong Zhu Closes #19465 from zsxwing/SPARK-21988. --- .../streaming/StreamingRelation.scala | 8 +++ .../spark/sql/streaming/StreamSuite.scala | 65 ++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index ab716052c28ba..6b82c78ea653d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -44,6 +44,14 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: extends LeafNode { override def isStreaming: Boolean = true override def toString: String = sourceName + + // There's no sensible value here. On the execution path, this relation will be + // swapped out with microbatches. But some dataframe operations (in particular explain) do lead + // to this node surviving analysis. So we satisfy the LeafNode contract with the session default + // value. 
+ override def computeStats(): Statistics = Statistics( + sizeInBytes = BigInt(dataSource.sparkSession.sessionState.conf.defaultSizeInBytes) + ) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 9c901062d570a..3d687d2214e90 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -76,20 +76,65 @@ class StreamSuite extends StreamTest { CheckAnswer(Row(1, 1, "one"), Row(2, 2, "two"), Row(4, 4, "four"))) } + test("StreamingRelation.computeStats") { + val streamingRelation = spark.readStream.format("rate").load().logicalPlan collect { + case s: StreamingRelation => s + } + assert(streamingRelation.nonEmpty, "cannot find StreamingRelation") + assert( + streamingRelation.head.computeStats.sizeInBytes == spark.sessionState.conf.defaultSizeInBytes) + } - test("explain join") { - // Make a table and ensure it will be broadcast. - val smallTable = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + test("StreamingExecutionRelation.computeStats") { + val streamingExecutionRelation = MemoryStream[Int].toDF.logicalPlan collect { + case s: StreamingExecutionRelation => s + } + assert(streamingExecutionRelation.nonEmpty, "cannot find StreamingExecutionRelation") + assert(streamingExecutionRelation.head.computeStats.sizeInBytes + == spark.sessionState.conf.defaultSizeInBytes) + } - // Join the input stream with a table. - val inputData = MemoryStream[Int] - val joined = inputData.toDF().join(smallTable, smallTable("number") === $"value") + test("explain join with a normal source") { + // This test triggers CostBasedJoinReorder to call `computeStats` + withSQLConf(SQLConf.CBO_ENABLED.key -> "true", SQLConf.JOIN_REORDER_ENABLED.key -> "true") { + val smallTable = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + val smallTable2 = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + val smallTable3 = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + + // Join the input stream with a table. + val df = spark.readStream.format("rate").load() + val joined = df.join(smallTable, smallTable("number") === $"value") + .join(smallTable2, smallTable2("number") === $"value") + .join(smallTable3, smallTable3("number") === $"value") + + val outputStream = new java.io.ByteArrayOutputStream() + Console.withOut(outputStream) { + joined.explain(true) + } + assert(outputStream.toString.contains("StreamingRelation")) + } + } - val outputStream = new java.io.ByteArrayOutputStream() - Console.withOut(outputStream) { - joined.explain() + test("explain join with MemoryStream") { + // This test triggers CostBasedJoinReorder to call `computeStats` + // Because MemoryStream doesn't use DataSource code path, we need a separate test. + withSQLConf(SQLConf.CBO_ENABLED.key -> "true", SQLConf.JOIN_REORDER_ENABLED.key -> "true") { + val smallTable = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + val smallTable2 = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + val smallTable3 = Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word") + + // Join the input stream with a table. 
+ val df = MemoryStream[Int].toDF + val joined = df.join(smallTable, smallTable("number") === $"value") + .join(smallTable2, smallTable2("number") === $"value") + .join(smallTable3, smallTable3("number") === $"value") + + val outputStream = new java.io.ByteArrayOutputStream() + Console.withOut(outputStream) { + joined.explain(true) + } + assert(outputStream.toString.contains("StreamingRelation")) } - assert(outputStream.toString.contains("StreamingRelation")) } test("SPARK-20432: union one stream with itself") { From ccdf21f56e4ff5497d7770dcbee2f7a60bb9e3a7 Mon Sep 17 00:00:00 2001 From: Jorge Machado Date: Wed, 11 Oct 2017 22:13:07 -0700 Subject: [PATCH 697/779] [SPARK-20055][DOCS] Added documentation for loading csv files into DataFrames ## What changes were proposed in this pull request? Added documentation for loading csv files into Dataframes ## How was this patch tested? /dev/run-tests Author: Jorge Machado Closes #19429 from jomach/master. --- docs/sql-programming-guide.md | 32 ++++++++++++++++--- .../sql/JavaSQLDataSourceExample.java | 7 ++++ examples/src/main/python/sql/datasource.py | 5 +++ examples/src/main/r/RSparkSQLExample.R | 6 ++++ examples/src/main/resources/people.csv | 3 ++ .../examples/sql/SQLDataSourceExample.scala | 8 +++++ 6 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 examples/src/main/resources/people.csv diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index a095263bfa619..639a8ea7bb8ad 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -461,6 +461,8 @@ name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can al names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data source type can be converted into other types using this syntax. +To load a JSON file you can use: +
{% include_example manual_load_options scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} @@ -479,6 +481,26 @@ source type can be converted into other types using this syntax.
+To load a CSV file you can use:
+
+<div class="codetabs">
+<div data-lang="scala"  markdown="1">
+{% include_example manual_load_options_csv scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java"  markdown="1">
+{% include_example manual_load_options_csv java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python"  markdown="1">
+{% include_example manual_load_options_csv python/sql/datasource.py %}
+</div>
+
+<div data-lang="r"  markdown="1">
+{% include_example manual_load_options_csv r/RSparkSQLExample.R %}
+</div>
+
+</div>
### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that @@ -573,7 +595,7 @@ Note that partition information is not gathered by default when creating externa ### Bucketing, Sorting and Partitioning -For file-based data source, it is also possible to bucket and sort or partition the output. +For file-based data source, it is also possible to bucket and sort or partition the output. Bucketing and sorting are applicable only to persistent tables:
@@ -598,7 +620,7 @@ CREATE TABLE users_bucketed_by_name( name STRING, favorite_color STRING, favorite_numbers array -) USING parquet +) USING parquet CLUSTERED BY(name) INTO 42 BUCKETS; {% endhighlight %} @@ -629,7 +651,7 @@ while partitioning can be used with both `save` and `saveAsTable` when using the {% highlight sql %} CREATE TABLE users_by_favorite_color( - name STRING, + name STRING, favorite_color STRING, favorite_numbers array ) USING csv PARTITIONED BY(favorite_color); @@ -664,7 +686,7 @@ CREATE TABLE users_bucketed_and_partitioned( name STRING, favorite_color STRING, favorite_numbers array -) USING parquet +) USING parquet PARTITIONED BY (favorite_color) CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS; @@ -675,7 +697,7 @@ CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS;
`partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section. -Thus, it has limited applicability to columns with high cardinality. In contrast +Thus, it has limited applicability to columns with high cardinality. In contrast `bucketBy` distributes data across a fixed number of buckets and can be used when a number of unique values is unbounded. diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java index 95859c52c2aeb..ef3c904775697 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -116,6 +116,13 @@ private static void runBasicDataSourceExample(SparkSession spark) { spark.read().format("json").load("examples/src/main/resources/people.json"); peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet"); // $example off:manual_load_options$ + // $example on:manual_load_options_csv$ + Dataset peopleDFCsv = spark.read().format("csv") + .option("sep", ";") + .option("inferSchema", "true") + .option("header", "true") + .load("examples/src/main/resources/people.csv"); + // $example off:manual_load_options_csv$ // $example on:direct_sql$ Dataset sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index f86012ea382e8..b375fa775de39 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -53,6 +53,11 @@ def basic_datasource_example(spark): df.select("name", "age").write.save("namesAndAges.parquet", format="parquet") # $example off:manual_load_options$ + # $example on:manual_load_options_csv$ + df = spark.read.load("examples/src/main/resources/people.csv", + format="csv", sep=":", inferSchema="true", header="true") + # $example off:manual_load_options_csv$ + # $example on:write_sorting_and_bucketing$ df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed") # $example off:write_sorting_and_bucketing$ diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index 3734568d872d0..a5ed723da47ca 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -113,6 +113,12 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example off:manual_load_options$ +# $example on:manual_load_options_csv$ +df <- read.df("examples/src/main/resources/people.csv", "csv") +namesAndAges <- select(df, "name", "age") +# $example off:manual_load_options_csv$ + + # $example on:direct_sql$ df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") # $example off:direct_sql$ diff --git a/examples/src/main/resources/people.csv b/examples/src/main/resources/people.csv new file mode 100644 index 0000000000000..7fe5adba93d77 --- /dev/null +++ b/examples/src/main/resources/people.csv @@ -0,0 +1,3 @@ +name;age;job +Jorge;30;Developer +Bob;32;Developer diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala index 86b3dc4a84f58..f9477969a4bb5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala @@ -49,6 +49,14 @@ object SQLDataSourceExample { val peopleDF = spark.read.format("json").load("examples/src/main/resources/people.json") peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet") // $example off:manual_load_options$ + // $example on:manual_load_options_csv$ + val peopleDFCsv = spark.read.format("csv") + .option("sep", ";") + .option("inferSchema", "true") + .option("header", "true") + .load("examples/src/main/resources/people.csv") + // $example off:manual_load_options_csv$ + // $example on:direct_sql$ val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") // $example off:direct_sql$ From 274f0efefa0c063649bccddb787e8863910f4366 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 12 Oct 2017 20:20:44 +0800 Subject: [PATCH 698/779] [SPARK-22252][SQL] FileFormatWriter should respect the input query schema ## What changes were proposed in this pull request? In https://github.com/apache/spark/pull/18064, we allowed `RunnableCommand` to have children in order to fix some UI issues. Then we made `InsertIntoXXX` commands take the input `query` as a child, when we do the actual writing, we just pass the physical plan to the writer(`FileFormatWriter.write`). However this is problematic. In Spark SQL, optimizer and planner are allowed to change the schema names a little bit. e.g. `ColumnPruning` rule will remove no-op `Project`s, like `Project("A", Scan("a"))`, and thus change the output schema from "" to ``. When it comes to writing, especially for self-description data format like parquet, we may write the wrong schema to the file and cause null values at the read path. Fortunately, in https://github.com/apache/spark/pull/18450 , we decided to allow nested execution and one query can map to multiple executions in the UI. This releases the major restriction in #18604 , and now we don't have to take the input `query` as child of `InsertIntoXXX` commands. So the fix is simple, this PR partially revert #18064 and make `InsertIntoXXX` commands leaf nodes again. ## How was this patch tested? new regression test Author: Wenchen Fan Closes #19474 from cloud-fan/bug. 
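The regression test added below boils down to the following spark-shell style sketch (the table names come from that test; the SparkSession setup is illustrative): the analyzed plan of the second write says the output columns are COL1/COL2, and the writer has to use that schema rather than the attribute names left behind after the optimizer prunes the no-op Project.

import org.apache.spark.sql.SparkSession

object WriterSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("writer-schema-sketch").getOrCreate()
    import spark.implicits._

    spark.range(1).select('id as 'col1, 'id as 'col2).write.saveAsTable("t1")
    // The projection below only changes the case of the column names. With the fix,
    // the data written for t2 matches the analyzed schema and reads back as Row(0, 0).
    spark.sql("select COL1, COL2 from t1").write.saveAsTable("t2")
    spark.table("t2").show()

    spark.stop()
  }
}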
--- .../spark/sql/catalyst/plans/QueryPlan.scala | 2 +- .../sql/catalyst/plans/logical/Command.scala | 3 +- .../spark/sql/execution/QueryExecution.scala | 4 +-- .../spark/sql/execution/SparkStrategies.scala | 2 +- .../execution/columnar/InMemoryRelation.scala | 3 +- .../columnar/InMemoryTableScanExec.scala | 2 +- .../command/DataWritingCommand.scala | 13 ++++++++ .../InsertIntoDataSourceDirCommand.scala | 6 ++-- .../spark/sql/execution/command/cache.scala | 2 +- .../sql/execution/command/commands.scala | 30 ++++++------------- .../command/createDataSourceTables.scala | 2 +- .../spark/sql/execution/command/views.scala | 4 +-- .../execution/datasources/DataSource.scala | 13 +++++++- .../datasources/FileFormatWriter.scala | 14 +++++---- .../InsertIntoDataSourceCommand.scala | 2 +- .../InsertIntoHadoopFsRelationCommand.scala | 9 ++---- .../SaveIntoDataSourceCommand.scala | 2 +- .../execution/streaming/FileStreamSink.scala | 2 +- .../datasources/FileFormatWriterSuite.scala | 16 +++++++++- .../execution/InsertIntoHiveDirCommand.scala | 10 ++----- .../hive/execution/InsertIntoHiveTable.scala | 14 ++------- .../sql/hive/execution/SaveAsHiveFile.scala | 6 ++-- 22 files changed, 85 insertions(+), 76 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 7addbaaa9afa5..c7952e3ff8280 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -178,7 +178,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT }) } - override def innerChildren: Seq[QueryPlan[_]] = subqueries + override protected def innerChildren: Seq[QueryPlan[_]] = subqueries /** * Returns a plan where a best effort attempt has been made to transform `this` in a way diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala index ec5766e1f67f2..38f47081b6f55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Attribute * commands can be used by parsers to represent DDL operations. Commands, unlike queries, are * eagerly executed. */ -trait Command extends LogicalPlan { +trait Command extends LeafNode { override def output: Seq[Attribute] = Seq.empty - override def children: Seq[LogicalPlan] = Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 4accf54a18232..f404621399cea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -119,7 +119,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { * `SparkSQLDriver` for CLI applications. */ def hiveResultString(): Seq[String] = executedPlan match { - case ExecutedCommandExec(desc: DescribeTableCommand, _) => + case ExecutedCommandExec(desc: DescribeTableCommand) => // If it is a describe command for a Hive table, we want to have the output format // be similar with Hive. 
desc.run(sparkSession).map { @@ -130,7 +130,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { .mkString("\t") } // SHOW TABLES in Hive only output table names, while ours output database, table name, isTemp. - case command @ ExecutedCommandExec(s: ShowTablesCommand, _) if !s.isExtended => + case command @ ExecutedCommandExec(s: ShowTablesCommand) if !s.isExtended => command.executeCollect().map(_.getString(1)) case other => val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 4cdcc73faacd7..19b858faba6ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -364,7 +364,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // Can we automate these 'pass through' operations? object BasicOperators extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case r: RunnableCommand => ExecutedCommandExec(r, r.children.map(planLater)) :: Nil + case r: RunnableCommand => ExecutedCommandExec(r) :: Nil case MemoryPlan(sink, output) => val encoder = RowEncoder(sink.schema) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index bc98d8d9d6d61..a1c62a729900e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -62,7 +62,8 @@ case class InMemoryRelation( @transient var _cachedColumnBuffers: RDD[CachedBatch] = null, val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator) extends logical.LeafNode with MultiInstanceRelation { - override def innerChildren: Seq[SparkPlan] = Seq(child) + + override protected def innerChildren: Seq[SparkPlan] = Seq(child) override def producedAttributes: AttributeSet = outputSet diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index c7ddec55682e1..af3636a5a2ca7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -34,7 +34,7 @@ case class InMemoryTableScanExec( @transient relation: InMemoryRelation) extends LeafExecNode { - override def innerChildren: Seq[QueryPlan[_]] = Seq(relation) ++ super.innerChildren + override protected def innerChildren: Seq[QueryPlan[_]] = Seq(relation) ++ super.innerChildren override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala index 4e1c5e4846f36..2cf06982e25f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.command 
import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkContext +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration @@ -30,6 +31,18 @@ import org.apache.spark.util.SerializableConfiguration */ trait DataWritingCommand extends RunnableCommand { + /** + * The input query plan that produces the data to be written. + */ + def query: LogicalPlan + + // We make the input `query` an inner child instead of a child in order to hide it from the + // optimizer. This is because optimizer may not preserve the output schema names' case, and we + // have to keep the original analyzed plan here so that we can pass the corrected schema to the + // writer. The schema of analyzed plan is what user expects(or specifies), so we should respect + // it when writing. + override protected def innerChildren: Seq[LogicalPlan] = query :: Nil + override lazy val metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala index 633de4c37af94..9e3519073303c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala @@ -21,7 +21,6 @@ import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources._ /** @@ -45,10 +44,9 @@ case class InsertIntoDataSourceDirCommand( query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { - override def children: Seq[LogicalPlan] = Seq(query) + override protected def innerChildren: Seq[LogicalPlan] = query :: Nil - override def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { - assert(children.length == 1) + override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty, "Directory path is required") assert(provider.nonEmpty, "Data source is required") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index 792290bef0163..140f920eaafae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -30,7 +30,7 @@ case class CacheTableCommand( require(plan.isEmpty || tableIdent.database.isEmpty, "Database name is not allowed in CACHE TABLE AS SELECT") - override def innerChildren: Seq[QueryPlan[_]] = plan.toSeq + override protected def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { plan.foreach { logicalPlan => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 7cd4baef89e75..e28b5eb2e2a2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -24,9 +24,9 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.catalyst.plans.{logical, QueryPlan} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} +import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} @@ -37,19 +37,13 @@ import org.apache.spark.sql.types._ * A logical command that is executed for its side-effects. `RunnableCommand`s are * wrapped in `ExecutedCommand` during execution. */ -trait RunnableCommand extends logical.Command { +trait RunnableCommand extends Command { // The map used to record the metrics of running the command. This will be passed to // `ExecutedCommand` during query planning. lazy val metrics: Map[String, SQLMetric] = Map.empty - def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { - throw new NotImplementedError - } - - def run(sparkSession: SparkSession): Seq[Row] = { - throw new NotImplementedError - } + def run(sparkSession: SparkSession): Seq[Row] } /** @@ -57,9 +51,8 @@ trait RunnableCommand extends logical.Command { * saves the result to prevent multiple executions. * * @param cmd the `RunnableCommand` this operator will run. - * @param children the children physical plans ran by the `RunnableCommand`. 
*/ -case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) extends SparkPlan { +case class ExecutedCommandExec(cmd: RunnableCommand) extends LeafExecNode { override lazy val metrics: Map[String, SQLMetric] = cmd.metrics @@ -74,19 +67,14 @@ case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) e */ protected[sql] lazy val sideEffectResult: Seq[InternalRow] = { val converter = CatalystTypeConverters.createToCatalystConverter(schema) - val rows = if (children.isEmpty) { - cmd.run(sqlContext.sparkSession) - } else { - cmd.run(sqlContext.sparkSession, children) - } - rows.map(converter(_).asInstanceOf[InternalRow]) + cmd.run(sqlContext.sparkSession).map(converter(_).asInstanceOf[InternalRow]) } - override def innerChildren: Seq[QueryPlan[_]] = cmd.innerChildren + override protected def innerChildren: Seq[QueryPlan[_]] = cmd :: Nil override def output: Seq[Attribute] = cmd.output - override def nodeName: String = cmd.nodeName + override def nodeName: String = "Execute " + cmd.nodeName override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 04b2534ca5eb1..9e3907996995c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -120,7 +120,7 @@ case class CreateDataSourceTableAsSelectCommand( query: LogicalPlan) extends RunnableCommand { - override def innerChildren: Seq[LogicalPlan] = Seq(query) + override protected def innerChildren: Seq[LogicalPlan] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { assert(table.tableType != CatalogTableType.VIEW) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index ffdfd527fa701..5172f32ec7b9c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -98,7 +98,7 @@ case class CreateViewCommand( import ViewHelper._ - override def innerChildren: Seq[QueryPlan[_]] = Seq(child) + override protected def innerChildren: Seq[QueryPlan[_]] = Seq(child) if (viewType == PersistedView) { require(originalText.isDefined, "'originalText' must be provided to create permanent view") @@ -267,7 +267,7 @@ case class AlterViewAsCommand( import ViewHelper._ - override def innerChildren: Seq[QueryPlan[_]] = Seq(query) + override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(session: SparkSession): Seq[Row] = { // If the plan cannot be analyzed, throw an exception and don't proceed. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index b9502a95a7c08..b43d282bd434c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -453,6 +453,17 @@ case class DataSource( val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis PartitioningUtils.validatePartitionColumn(data.schema, partitionColumns, caseSensitive) + + // SPARK-17230: Resolve the partition columns so InsertIntoHadoopFsRelationCommand does + // not need to have the query as child, to avoid to analyze an optimized query, + // because InsertIntoHadoopFsRelationCommand will be optimized first. + val partitionAttributes = partitionColumns.map { name => + data.output.find(a => equality(a.name, name)).getOrElse { + throw new AnalysisException( + s"Unable to resolve $name given [${data.output.map(_.name).mkString(", ")}]") + } + } + val fileIndex = catalogTable.map(_.identifier).map { tableIdent => sparkSession.table(tableIdent).queryExecution.analyzed.collect { case LogicalRelation(t: HadoopFsRelation, _, _, _) => t.location @@ -465,7 +476,7 @@ case class DataSource( outputPath = outputPath, staticPartitions = Map.empty, ifPartitionNotExists = false, - partitionColumns = partitionColumns.map(UnresolvedAttribute.quoted), + partitionColumns = partitionAttributes, bucketSpec = bucketSpec, fileFormat = format, options = options, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 514969715091a..75b1695fbc275 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, _} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.execution.{SortExec, SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.{QueryExecution, SortExec, SQLExecution} import org.apache.spark.sql.types.StringType import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -101,7 +101,7 @@ object FileFormatWriter extends Logging { */ def write( sparkSession: SparkSession, - plan: SparkPlan, + queryExecution: QueryExecution, fileFormat: FileFormat, committer: FileCommitProtocol, outputSpec: OutputSpec, @@ -117,7 +117,9 @@ object FileFormatWriter extends Logging { job.setOutputValueClass(classOf[InternalRow]) FileOutputFormat.setOutputPath(job, new Path(outputSpec.outputPath)) - val allColumns = plan.output + // Pick the attributes from analyzed plan, as optimizer may not preserve the output schema + // names' case. + val allColumns = queryExecution.analyzed.output val partitionSet = AttributeSet(partitionColumns) val dataColumns = allColumns.filterNot(partitionSet.contains) @@ -158,7 +160,7 @@ object FileFormatWriter extends Logging { // We should first sort by partition columns, then bucket id, and finally sorting columns. 
val requiredOrdering = partitionColumns ++ bucketIdExpression ++ sortColumns // the sort order doesn't matter - val actualOrdering = plan.outputOrdering.map(_.child) + val actualOrdering = queryExecution.executedPlan.outputOrdering.map(_.child) val orderingMatched = if (requiredOrdering.length > actualOrdering.length) { false } else { @@ -176,12 +178,12 @@ object FileFormatWriter extends Logging { try { val rdd = if (orderingMatched) { - plan.execute() + queryExecution.toRdd } else { SortExec( requiredOrdering.map(SortOrder(_, Ascending)), global = false, - child = plan).execute() + child = queryExecution.executedPlan).execute() } val ret = new Array[WriteTaskResult](rdd.partitions.length) sparkSession.sparkContext.runJob( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala index 08b2f4f31170f..a813829d50cb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala @@ -33,7 +33,7 @@ case class InsertIntoDataSourceCommand( overwrite: Boolean) extends RunnableCommand { - override def innerChildren: Seq[QueryPlan[_]] = Seq(query) + override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { val relation = logicalRelation.relation.asInstanceOf[InsertableRelation] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 64e5a57adc37c..675bee85bf61e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogT import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.util.SchemaUtils @@ -57,11 +56,7 @@ case class InsertIntoHadoopFsRelationCommand( extends DataWritingCommand { import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName - override def children: Seq[LogicalPlan] = query :: Nil - - override def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { - assert(children.length == 1) - + override def run(sparkSession: SparkSession): Seq[Row] = { // Most formats don't do well with duplicate columns, so lets not allow that SchemaUtils.checkSchemaColumnNameDuplication( query.schema, @@ -144,7 +139,7 @@ case class InsertIntoHadoopFsRelationCommand( val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, - plan = children.head, + queryExecution = Dataset.ofRows(sparkSession, query).queryExecution, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala index 5eb6a8471be0d..96c84eab1c894 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala @@ -38,7 +38,7 @@ case class SaveIntoDataSourceCommand( options: Map[String, String], mode: SaveMode) extends RunnableCommand { - override def innerChildren: Seq[QueryPlan[_]] = Seq(query) + override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { dataSource.createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 72e5ac40bbfed..6bd0696622005 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -121,7 +121,7 @@ class FileStreamSink( FileFormatWriter.write( sparkSession = sparkSession, - plan = data.queryExecution.executedPlan, + queryExecution = data.queryExecution, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec(path, Map.empty), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala index a0c1ea63d3827..6f8767db176aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.execution.datasources -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.test.SharedSQLContext class FileFormatWriterSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("empty file should be skipped while write to file") { withTempPath { path => @@ -30,4 +31,17 @@ class FileFormatWriterSuite extends QueryTest with SharedSQLContext { assert(partFiles.length === 2) } } + + test("FileFormatWriter should respect the input query schema") { + withTable("t1", "t2", "t3", "t4") { + spark.range(1).select('id as 'col1, 'id as 'col2).write.saveAsTable("t1") + spark.sql("select COL1, COL2 from t1").write.saveAsTable("t2") + checkAnswer(spark.table("t2"), Row(0, 0)) + + // Test picking part of the columns when writing. 
+ spark.range(1).select('id, 'id as 'col1, 'id as 'col2).write.saveAsTable("t3") + spark.sql("select COL1, COL2 from t3").write.saveAsTable("t4") + checkAnswer(spark.table("t4"), Row(0, 0)) + } + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala index 918c8be00d69d..1c6f8dd77fc2c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala @@ -27,11 +27,10 @@ import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl /** @@ -57,10 +56,7 @@ case class InsertIntoHiveDirCommand( query: LogicalPlan, overwrite: Boolean) extends SaveAsHiveFile { - override def children: Seq[LogicalPlan] = query :: Nil - - override def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { - assert(children.length == 1) + override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( @@ -102,7 +98,7 @@ case class InsertIntoHiveDirCommand( try { saveAsHiveFile( sparkSession = sparkSession, - plan = children.head, + queryExecution = Dataset.ofRows(sparkSession, query).queryExecution, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index e5b59ed7a1a6b..56e10bc457a00 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -17,20 +17,16 @@ package org.apache.spark.sql.hive.execution -import scala.util.control.NonFatal - import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.ErrorMsg import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.CommandUtils -import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveClientImpl @@ -72,16 +68,12 @@ case class InsertIntoHiveTable( overwrite: Boolean, ifPartitionNotExists: Boolean) extends SaveAsHiveFile { - override def children: Seq[LogicalPlan] = query :: Nil - /** * Inserts all the rows in the table into Hive. 
Row objects are properly serialized with the * `org.apache.hadoop.hive.serde2.SerDe` and the * `org.apache.hadoop.mapred.OutputFormat` provided by the table definition. */ - override def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { - assert(children.length == 1) - + override def run(sparkSession: SparkSession): Seq[Row] = { val externalCatalog = sparkSession.sharedState.externalCatalog val hadoopConf = sparkSession.sessionState.newHadoopConf() @@ -170,7 +162,7 @@ case class InsertIntoHiveTable( saveAsHiveFile( sparkSession = sparkSession, - plan = children.head, + queryExecution = Dataset.ofRows(sparkSession, query).queryExecution, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpLocation.toString, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 2d74ef040ef5a..63657590e5e79 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -33,7 +33,7 @@ import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.DataWritingCommand import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.hive.HiveExternalCatalog @@ -47,7 +47,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { protected def saveAsHiveFile( sparkSession: SparkSession, - plan: SparkPlan, + queryExecution: QueryExecution, hadoopConf: Configuration, fileSinkConf: FileSinkDesc, outputLocation: String, @@ -75,7 +75,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { FileFormatWriter.write( sparkSession = sparkSession, - plan = plan, + queryExecution = queryExecution, fileFormat = new HiveFileFormat(fileSinkConf), committer = committer, outputSpec = FileFormatWriter.OutputSpec(outputLocation, customPartitionLocations), From b5c1ef7a8e4db4067bc361d10d554ee9a538423f Mon Sep 17 00:00:00 2001 From: Xianyang Liu Date: Thu, 12 Oct 2017 20:26:51 +0800 Subject: [PATCH 699/779] [SPARK-22097][CORE] Request an accurate memory after we unrolled the block ## What changes were proposed in this pull request? We only need request `bbos.size - unrollMemoryUsedByThisBlock` after unrolled the block. ## How was this patch tested? Existing UT. Author: Xianyang Liu Closes #19316 from ConeyLiu/putIteratorAsBytes. --- .../org/apache/spark/storage/memory/MemoryStore.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala index 651e9c7b2ab61..17f7a69ad6ba1 100644 --- a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala @@ -388,7 +388,13 @@ private[spark] class MemoryStore( // perform one final call to attempt to allocate additional memory if necessary. 
if (keepUnrolling) { serializationStream.close() - reserveAdditionalMemoryIfNecessary() + if (bbos.size > unrollMemoryUsedByThisBlock) { + val amountToRequest = bbos.size - unrollMemoryUsedByThisBlock + keepUnrolling = reserveUnrollMemoryForThisTask(blockId, amountToRequest, memoryMode) + if (keepUnrolling) { + unrollMemoryUsedByThisBlock += amountToRequest + } + } } if (keepUnrolling) { From 73d80ec49713605d6a589e688020f0fc2d6feab2 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 12 Oct 2017 20:34:03 +0800 Subject: [PATCH 700/779] [SPARK-22197][SQL] push down operators to data source before planning ## What changes were proposed in this pull request? As we discussed in https://github.com/apache/spark/pull/19136#discussion_r137023744 , we should push down operators to data source before planning, so that data source can report statistics more accurate. This PR also includes some cleanup for the read path. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #19424 from cloud-fan/follow. --- .../spark/sql/sources/v2/ReadSupport.java | 5 +- .../sql/sources/v2/ReadSupportWithSchema.java | 5 +- .../sql/sources/v2/reader/DataReader.java | 4 + .../sources/v2/reader/DataSourceV2Reader.java | 2 +- .../spark/sql/sources/v2/reader/ReadTask.java | 3 +- .../SupportsPushDownCatalystFilters.java | 8 + .../v2/reader/SupportsPushDownFilters.java | 8 + .../apache/spark/sql/DataFrameReader.scala | 5 +- .../spark/sql/execution/SparkOptimizer.scala | 4 +- .../v2/DataSourceReaderHolder.scala | 68 +++++++++ .../datasources/v2/DataSourceV2Relation.scala | 8 +- .../datasources/v2/DataSourceV2ScanExec.scala | 22 +-- .../datasources/v2/DataSourceV2Strategy.scala | 60 +------- .../v2/PushDownOperatorsToDataSource.scala | 140 ++++++++++++++++++ .../sources/v2/JavaAdvancedDataSourceV2.java | 5 + .../sql/sources/v2/DataSourceV2Suite.scala | 2 + 16 files changed, 262 insertions(+), 87 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java index ab5254a688d5a..ee489ad0f608f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java @@ -30,9 +30,8 @@ public interface ReadSupport { /** * Creates a {@link DataSourceV2Reader} to scan the data from this data source. * - * @param options the options for this data source reader, which is an immutable case-insensitive - * string-to-string map. - * @return a reader that implements the actual read logic. + * @param options the options for the returned data source reader, which is an immutable + * case-insensitive string-to-string map. 
*/ DataSourceV2Reader createReader(DataSourceV2Options options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java index c13aeca2ef36f..74e81a2c84d68 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java @@ -39,9 +39,8 @@ public interface ReadSupportWithSchema { * physical schema of the underlying storage of this data source reader, e.g. * CSV files, JSON files, etc, while this reader may not read data with full * schema, as column pruning or other optimizations may happen. - * @param options the options for this data source reader, which is an immutable case-insensitive - * string-to-string map. - * @return a reader that implements the actual read logic. + * @param options the options for the returned data source reader, which is an immutable + * case-insensitive string-to-string map. */ DataSourceV2Reader createReader(StructType schema, DataSourceV2Options options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java index cfafc1a576793..95e091569b614 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java @@ -24,6 +24,10 @@ /** * A data reader returned by {@link ReadTask#createReader()} and is responsible for outputting data * for a RDD partition. + * + * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data + * source readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for data source + * readers that mix in {@link SupportsScanUnsafeRow}. */ @InterfaceStability.Evolving public interface DataReader extends Closeable { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java index fb4d5c0d7ae41..5989a4ac8440b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java @@ -30,7 +30,7 @@ * {@link org.apache.spark.sql.sources.v2.ReadSupportWithSchema#createReader( * StructType, org.apache.spark.sql.sources.v2.DataSourceV2Options)}. * It can mix in various query optimization interfaces to speed up the data scan. The actual scan - * logic should be delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}. + * logic is delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}. * * There are mainly 3 kinds of query optimizations: * 1. Operators push-down. E.g., filter push-down, required columns push-down(aka column diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java index 7885bfcdd49e4..01362df0978cb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java @@ -27,7 +27,8 @@ * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}. 
* * Note that, the read task will be serialized and sent to executors, then the data reader will be - * created on executors and do the actual reading. + * created on executors and do the actual reading. So {@link ReadTask} must be serializable and + * {@link DataReader} doesn't need to be. */ @InterfaceStability.Evolving public interface ReadTask extends Serializable { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java index 19d706238ec8e..d6091774d75aa 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java @@ -40,4 +40,12 @@ public interface SupportsPushDownCatalystFilters { * Pushes down filters, and returns unsupported filters. */ Expression[] pushCatalystFilters(Expression[] filters); + + /** + * Returns the catalyst filters that are pushed in {@link #pushCatalystFilters(Expression[])}. + * It's possible that there is no filters in the query and + * {@link #pushCatalystFilters(Expression[])} is never called, empty array should be returned for + * this case. + */ + Expression[] pushedCatalystFilters(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java index d4b509e7080f2..d6f297c013375 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources.v2.reader; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.sources.Filter; /** @@ -35,4 +36,11 @@ public interface SupportsPushDownFilters { * Pushes down filters, and returns unsupported filters. */ Filter[] pushFilters(Filter[] filters); + + /** + * Returns the filters that are pushed in {@link #pushFilters(Filter[])}. + * It's possible that there is no filters in the query and {@link #pushFilters(Filter[])} + * is never called, empty array should be returned for this case. 
+ */ + Filter[] pushedFilters(); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 78b668c04fd5c..17966eecfc051 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -184,7 +184,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { val cls = DataSource.lookupDataSource(source) if (classOf[DataSourceV2].isAssignableFrom(cls)) { - val dataSource = cls.newInstance() val options = new DataSourceV2Options(extraOptions.asJava) val reader = (cls.newInstance(), userSpecifiedSchema) match { @@ -194,8 +193,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { case (ds: ReadSupport, None) => ds.createReader(options) - case (_: ReadSupportWithSchema, None) => - throw new AnalysisException(s"A schema needs to be specified when using $dataSource.") + case (ds: ReadSupportWithSchema, None) => + throw new AnalysisException(s"A schema needs to be specified when using $ds.") case (ds: ReadSupport, Some(schema)) => val reader = ds.createReader(options) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 00ff4c8ac310b..1c8e4050978dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.ExperimentalMethods import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions +import org.apache.spark.sql.execution.datasources.v2.PushDownOperatorsToDataSource import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate class SparkOptimizer( @@ -31,7 +32,8 @@ class SparkOptimizer( override def batches: Seq[Batch] = (preOptimizationBatches ++ super.batches :+ Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+ - Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions)) ++ + Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+ + Batch("Push down operators to data source scan", Once, PushDownOperatorsToDataSource)) ++ postHocOptimizationBatches :+ Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala new file mode 100644 index 0000000000000..6093df26630cd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import java.util.Objects + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.sources.v2.reader._ + +/** + * A base class for data source reader holder with customized equals/hashCode methods. + */ +trait DataSourceReaderHolder { + + /** + * The full output of the data source reader, without column pruning. + */ + def fullOutput: Seq[AttributeReference] + + /** + * The held data source reader. + */ + def reader: DataSourceV2Reader + + /** + * The metadata of this data source reader that can be used for equality test. + */ + private def metadata: Seq[Any] = { + val filters: Any = reader match { + case s: SupportsPushDownCatalystFilters => s.pushedCatalystFilters().toSet + case s: SupportsPushDownFilters => s.pushedFilters().toSet + case _ => Nil + } + Seq(fullOutput, reader.getClass, reader.readSchema(), filters) + } + + def canEqual(other: Any): Boolean + + override def equals(other: Any): Boolean = other match { + case other: DataSourceReaderHolder => + canEqual(other) && metadata.length == other.metadata.length && + metadata.zip(other.metadata).forall { case (l, r) => l == r } + case _ => false + } + + override def hashCode(): Int = { + metadata.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b) + } + + lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name => + fullOutput.find(_.name == name).get + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 3c9b598fd07c9..7eb99a645001a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} -import org.apache.spark.sql.sources.v2.reader.{DataSourceV2Reader, SupportsReportStatistics} +import org.apache.spark.sql.sources.v2.reader._ case class DataSourceV2Relation( - output: Seq[AttributeReference], - reader: DataSourceV2Reader) extends LeafNode { + fullOutput: Seq[AttributeReference], + reader: DataSourceV2Reader) extends LeafNode with DataSourceReaderHolder { + + override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation] override def computeStats(): Statistics = reader match { case r: SupportsReportStatistics => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 7999c0ceb5749..addc12a3f0901 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -29,20 +29,14 @@ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.types.StructType +/** + * Physical plan node for scanning data from a data source. + */ case class DataSourceV2ScanExec( - fullOutput: Array[AttributeReference], - @transient reader: DataSourceV2Reader, - // TODO: these 3 parameters are only used to determine the equality of the scan node, however, - // the reader also have this information, and ideally we can just rely on the equality of the - // reader. The only concern is, the reader implementation is outside of Spark and we have no - // control. - readSchema: StructType, - @transient filters: ExpressionSet, - hashPartitionKeys: Seq[String]) extends LeafExecNode { - - def output: Seq[Attribute] = readSchema.map(_.name).map { name => - fullOutput.find(_.name == name).get - } + fullOutput: Seq[AttributeReference], + @transient reader: DataSourceV2Reader) extends LeafExecNode with DataSourceReaderHolder { + + override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] override def references: AttributeSet = AttributeSet.empty @@ -74,7 +68,7 @@ class RowToUnsafeRowReadTask(rowReadTask: ReadTask[Row], schema: StructType) override def preferredLocations: Array[String] = rowReadTask.preferredLocations override def createReader: DataReader[UnsafeRow] = { - new RowToUnsafeDataReader(rowReadTask.createReader, RowEncoder.apply(schema)) + new RowToUnsafeDataReader(rowReadTask.createReader, RowEncoder.apply(schema).resolveAndBind()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index b80f695b2a87f..f2cda002245e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -29,64 +29,8 @@ import org.apache.spark.sql.sources.v2.reader._ object DataSourceV2Strategy extends Strategy { // TODO: write path override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case PhysicalOperation(projects, filters, DataSourceV2Relation(output, reader)) => - val stayUpFilters: Seq[Expression] = reader match { - case r: SupportsPushDownCatalystFilters => - r.pushCatalystFilters(filters.toArray) - - case r: SupportsPushDownFilters => - // A map from original Catalyst expressions to corresponding translated data source - // filters. If a predicate is not in this map, it means it cannot be pushed down. - val translatedMap: Map[Expression, Filter] = filters.flatMap { p => - DataSourceStrategy.translateFilter(p).map(f => p -> f) - }.toMap - - // Catalyst predicate expressions that cannot be converted to data source filters. - val nonConvertiblePredicates = filters.filterNot(translatedMap.contains) - - // Data source filters that cannot be pushed down. An unhandled filter means - // the data source cannot guarantee the rows returned can pass the filter. - // As a result we must return it so Spark can plan an extra filter operator. 
- val unhandledFilters = r.pushFilters(translatedMap.values.toArray).toSet - val unhandledPredicates = translatedMap.filter { case (_, f) => - unhandledFilters.contains(f) - }.keys - - nonConvertiblePredicates ++ unhandledPredicates - - case _ => filters - } - - val attrMap = AttributeMap(output.zip(output)) - val projectSet = AttributeSet(projects.flatMap(_.references)) - val filterSet = AttributeSet(stayUpFilters.flatMap(_.references)) - - // Match original case of attributes. - // TODO: nested fields pruning - val requiredColumns = (projectSet ++ filterSet).toSeq.map(attrMap) - reader match { - case r: SupportsPushDownRequiredColumns => - r.pruneColumns(requiredColumns.toStructType) - case _ => - } - - val scan = DataSourceV2ScanExec( - output.toArray, - reader, - reader.readSchema(), - ExpressionSet(filters), - Nil) - - val filterCondition = stayUpFilters.reduceLeftOption(And) - val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan) - - val withProject = if (projects == withFilter.output) { - withFilter - } else { - ProjectExec(projects, withFilter) - } - - withProject :: Nil + case DataSourceV2Relation(output, reader) => + DataSourceV2ScanExec(output, reader) :: Nil case _ => Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala new file mode 100644 index 0000000000000..0c1708131ae46 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeMap, Expression, NamedExpression, PredicateHelper} +import org.apache.spark.sql.catalyst.optimizer.RemoveRedundantProject +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.sources +import org.apache.spark.sql.sources.v2.reader._ + +/** + * Pushes down various operators to the underlying data source for better performance. Operators are + * being pushed down with a specific order. As an example, given a LIMIT has a FILTER child, you + * can't push down LIMIT if FILTER is not completely pushed down. When both are pushed down, the + * data source should execute FILTER before LIMIT. And required columns are calculated at the end, + * because when more operators are pushed down, we may need less columns at Spark side. 
+ */ +object PushDownOperatorsToDataSource extends Rule[LogicalPlan] with PredicateHelper { + override def apply(plan: LogicalPlan): LogicalPlan = { + // Note that, we need to collect the target operator along with PROJECT node, as PROJECT may + // appear in many places for column pruning. + // TODO: Ideally column pruning should be implemented via a plan property that is propagated + // top-down, then we can simplify the logic here and only collect target operators. + val filterPushed = plan transformUp { + case FilterAndProject(fields, condition, r @ DataSourceV2Relation(_, reader)) => + // Non-deterministic expressions are stateful and we must keep the input sequence unchanged + // to avoid changing the result. This means, we can't evaluate the filter conditions that + // are after the first non-deterministic condition ahead. Here we only try to push down + // deterministic conditions that are before the first non-deterministic condition. + val (candidates, containingNonDeterministic) = + splitConjunctivePredicates(condition).span(_.deterministic) + + val stayUpFilters: Seq[Expression] = reader match { + case r: SupportsPushDownCatalystFilters => + r.pushCatalystFilters(candidates.toArray) + + case r: SupportsPushDownFilters => + // A map from original Catalyst expressions to corresponding translated data source + // filters. If a predicate is not in this map, it means it cannot be pushed down. + val translatedMap: Map[Expression, sources.Filter] = candidates.flatMap { p => + DataSourceStrategy.translateFilter(p).map(f => p -> f) + }.toMap + + // Catalyst predicate expressions that cannot be converted to data source filters. + val nonConvertiblePredicates = candidates.filterNot(translatedMap.contains) + + // Data source filters that cannot be pushed down. An unhandled filter means + // the data source cannot guarantee the rows returned can pass the filter. + // As a result we must return it so Spark can plan an extra filter operator. + val unhandledFilters = r.pushFilters(translatedMap.values.toArray).toSet + val unhandledPredicates = translatedMap.filter { case (_, f) => + unhandledFilters.contains(f) + }.keys + + nonConvertiblePredicates ++ unhandledPredicates + + case _ => candidates + } + + val filterCondition = (stayUpFilters ++ containingNonDeterministic).reduceLeftOption(And) + val withFilter = filterCondition.map(Filter(_, r)).getOrElse(r) + if (withFilter.output == fields) { + withFilter + } else { + Project(fields, withFilter) + } + } + + // TODO: add more push down rules. + + // TODO: nested fields pruning + def pushDownRequiredColumns(plan: LogicalPlan, requiredByParent: Seq[Attribute]): Unit = { + plan match { + case Project(projectList, child) => + val required = projectList.filter(requiredByParent.contains).flatMap(_.references) + pushDownRequiredColumns(child, required) + + case Filter(condition, child) => + val required = requiredByParent ++ condition.references + pushDownRequiredColumns(child, required) + + case DataSourceV2Relation(fullOutput, reader) => reader match { + case r: SupportsPushDownRequiredColumns => + // Match original case of attributes. + val attrMap = AttributeMap(fullOutput.zip(fullOutput)) + val requiredColumns = requiredByParent.map(attrMap) + r.pruneColumns(requiredColumns.toStructType) + case _ => + } + + // TODO: there may be more operators can be used to calculate required columns, we can add + // more and more in the future. 
+ case _ => plan.children.foreach(child => pushDownRequiredColumns(child, child.output)) + } + } + + pushDownRequiredColumns(filterPushed, filterPushed.output) + // After column pruning, we may have redundant PROJECT nodes in the query plan, remove them. + RemoveRedundantProject(filterPushed) + } + + /** + * Finds a Filter node(with an optional Project child) above data source relation. + */ + object FilterAndProject { + // returns the project list, the filter condition and the data source relation. + def unapply(plan: LogicalPlan) + : Option[(Seq[NamedExpression], Expression, DataSourceV2Relation)] = plan match { + + case Filter(condition, r: DataSourceV2Relation) => Some((r.output, condition, r)) + + case Filter(condition, Project(fields, r: DataSourceV2Relation)) + if fields.forall(_.deterministic) => + val attributeMap = AttributeMap(fields.map(e => e.toAttribute -> e)) + val substituted = condition.transform { + case a: Attribute => attributeMap.getOrElse(a, a) + } + Some((fields, substituted, r)) + + case _ => None + } + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java index 7aacf0346d2fb..da2c13f70c52a 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java @@ -54,6 +54,11 @@ public Filter[] pushFilters(Filter[] filters) { return new Filter[0]; } + @Override + public Filter[] pushedFilters() { + return filters; + } + @Override public List> createReadTasks() { List> res = new ArrayList<>(); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index 9ce93d7ae926c..f238e565dc2fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -129,6 +129,8 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport { Array.empty } + override def pushedFilters(): Array[Filter] = filters + override def readSchema(): StructType = { requiredSchema } From 02218c4c73c32741390d9906b6190ef2124ce518 Mon Sep 17 00:00:00 2001 From: Ala Luszczak Date: Thu, 12 Oct 2017 17:00:22 +0200 Subject: [PATCH 701/779] [SPARK-22251][SQL] Metric 'aggregate time' is incorrect when codegen is off ## What changes were proposed in this pull request? Adding the code for setting 'aggregate time' metric to non-codegen path in HashAggregateExec and to ObjectHashAggregateExces. ## How was this patch tested? Tested manually. Author: Ala Luszczak Closes #19473 from ala/fix-agg-time. 
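For context, the sketch below illustrates the timing pattern this patch applies around the construction of the per-partition aggregation iterator. It is a standalone illustration only, not Spark code: `AggTimeSketch`, `timed`, and the `AtomicLong` are stand-ins invented here for Spark's `SQLMetric` (the actual change uses `longMetric("aggTime")`, as the diff below shows).

```scala
import java.util.concurrent.atomic.AtomicLong

object AggTimeSketch {
  // Stand-in for the "aggregate time" SQLMetric, accumulated in milliseconds.
  val aggTimeMs = new AtomicLong(0L)

  // Runs `setup`, charges its wall-clock cost to aggTimeMs, and returns its result.
  def timed[T](setup: => Iterator[T]): Iterator[T] = {
    val beforeAgg = System.nanoTime()
    val res = setup // e.g. building the aggregation iterator for one partition
    aggTimeMs.addAndGet((System.nanoTime() - beforeAgg) / 1000000)
    res
  }

  def main(args: Array[String]): Unit = {
    val iter = timed(Iterator.tabulate(1000000)(i => i * 2L))
    println(s"first element: ${iter.next()}, aggregate time: ${aggTimeMs.get()} ms")
  }
}
```

As in the patch, only the construction of the returned iterator is timed; rows pulled from it afterwards are not counted toward the metric.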
--- .../sql/execution/aggregate/HashAggregateExec.scala | 6 +++++- .../execution/aggregate/ObjectHashAggregateExec.scala | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 8b573fdcf25e1..43e5ff89afee6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -95,11 +95,13 @@ case class HashAggregateExec( val peakMemory = longMetric("peakMemory") val spillSize = longMetric("spillSize") val avgHashProbe = longMetric("avgHashProbe") + val aggTime = longMetric("aggTime") child.execute().mapPartitionsWithIndex { (partIndex, iter) => + val beforeAgg = System.nanoTime() val hasInput = iter.hasNext - if (!hasInput && groupingExpressions.nonEmpty) { + val res = if (!hasInput && groupingExpressions.nonEmpty) { // This is a grouped aggregate and the input iterator is empty, // so return an empty iterator. Iterator.empty @@ -128,6 +130,8 @@ case class HashAggregateExec( aggregationIterator } } + aggTime += (System.nanoTime() - beforeAgg) / 1000000 + res } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala index 6316e06a8f34e..ec3f9a05b5ccc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala @@ -76,7 +76,8 @@ case class ObjectHashAggregateExec( aggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) override lazy val metrics = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows") + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "aggregate time") ) override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) @@ -96,11 +97,13 @@ case class ObjectHashAggregateExec( protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { val numOutputRows = longMetric("numOutputRows") + val aggTime = longMetric("aggTime") val fallbackCountThreshold = sqlContext.conf.objectAggSortBasedFallbackThreshold child.execute().mapPartitionsWithIndexInternal { (partIndex, iter) => + val beforeAgg = System.nanoTime() val hasInput = iter.hasNext - if (!hasInput && groupingExpressions.nonEmpty) { + val res = if (!hasInput && groupingExpressions.nonEmpty) { // This is a grouped aggregate and the input kvIterator is empty, // so return an empty kvIterator. Iterator.empty @@ -127,6 +130,8 @@ case class ObjectHashAggregateExec( aggregationIterator } } + aggTime += (System.nanoTime() - beforeAgg) / 1000000 + res } } From 9104add4c7c6b578df15b64a8533a1266f90734e Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 13 Oct 2017 08:40:26 +0900 Subject: [PATCH 702/779] [SPARK-22217][SQL] ParquetFileFormat to support arbitrary OutputCommitters ## What changes were proposed in this pull request? 
`ParquetFileFormat` to relax its requirement of output committer class from `org.apache.parquet.hadoop.ParquetOutputCommitter` or subclass thereof (and so implicitly Hadoop `FileOutputCommitter`) to any committer implementing `org.apache.hadoop.mapreduce.OutputCommitter` This enables output committers which don't write to the filesystem the way `FileOutputCommitter` does to save parquet data from a dataframe: at present you cannot do this. Before a committer which isn't a subclass of `ParquetOutputCommitter`, it checks to see if the context has requested summary metadata by setting `parquet.enable.summary-metadata`. If true, and the committer class isn't a parquet committer, it raises a RuntimeException with an error message. (It could downgrade, of course, but raising an exception makes it clear there won't be an summary. It also makes the behaviour testable.) Note that `SQLConf` already states that any `OutputCommitter` can be used, but that typically it's a subclass of ParquetOutputCommitter. That's not currently true. This patch will make the code consistent with the docs, adding tests to verify, ## How was this patch tested? The patch includes a test suite, `ParquetCommitterSuite`, with a new committer, `MarkingFileOutputCommitter` which extends `FileOutputCommitter` and writes a marker file in the destination directory. The presence of the marker file can be used to verify the new committer was used. The tests then try the combinations of Parquet committer summary/no-summary and marking committer summary/no-summary. | committer | summary | outcome | |-----------|---------|---------| | parquet | true | success | | parquet | false | success | | marking | false | success with marker | | marking | true | exception | All tests are happy. Author: Steve Loughran Closes #19448 from steveloughran/cloud/SPARK-22217-committer. --- .../apache/spark/sql/internal/SQLConf.scala | 5 +- .../parquet/ParquetFileFormat.scala | 12 +- .../parquet/ParquetCommitterSuite.scala | 152 ++++++++++++++++++ 3 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 58323740b80cc..618d4a0d6148a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -306,8 +306,9 @@ object SQLConf { val PARQUET_OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.parquet.output.committer.class") .doc("The output committer class used by Parquet. The specified class needs to be a " + - "subclass of org.apache.hadoop.mapreduce.OutputCommitter. Typically, it's also a subclass " + - "of org.apache.parquet.hadoop.ParquetOutputCommitter.") + "subclass of org.apache.hadoop.mapreduce.OutputCommitter. Typically, it's also a subclass " + + "of org.apache.parquet.hadoop.ParquetOutputCommitter. 
If it is not, then metadata summaries" + + "will never be created, irrespective of the value of parquet.enable.summary-metadata") .internal() .stringConf .createWithDefault("org.apache.parquet.hadoop.ParquetOutputCommitter") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index e1e740500205a..c1535babbae1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -86,7 +86,7 @@ class ParquetFileFormat conf.getClass( SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, classOf[ParquetOutputCommitter], - classOf[ParquetOutputCommitter]) + classOf[OutputCommitter]) if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { logInfo("Using default output committer for Parquet: " + @@ -98,7 +98,7 @@ class ParquetFileFormat conf.setClass( SQLConf.OUTPUT_COMMITTER_CLASS.key, committerClass, - classOf[ParquetOutputCommitter]) + classOf[OutputCommitter]) // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why @@ -138,6 +138,14 @@ class ParquetFileFormat conf.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) } + if (conf.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) + && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)) { + // output summary is requested, but the class is not a Parquet Committer + logWarning(s"Committer $committerClass is not a ParquetOutputCommitter and cannot" + + s" create job summaries. " + + s"Set Parquet option ${ParquetOutputFormat.ENABLE_JOB_SUMMARY} to false.") + } + new OutputWriterFactory { // This OutputWriterFactory instance is deserialized when writing Parquet files on the // executor side without constructing or deserializing ParquetFileFormat. Therefore, we hold diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala new file mode 100644 index 0000000000000..caa4f6d70c6a9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.FileNotFoundException + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} + +import org.apache.spark.{LocalSparkContext, SparkFunSuite} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils + +/** + * Test logic related to choice of output committers. + */ +class ParquetCommitterSuite extends SparkFunSuite with SQLTestUtils + with LocalSparkContext { + + private val PARQUET_COMMITTER = classOf[ParquetOutputCommitter].getCanonicalName + + protected var spark: SparkSession = _ + + /** + * Create a new [[SparkSession]] running in local-cluster mode with unsafe and codegen enabled. + */ + override def beforeAll(): Unit = { + super.beforeAll() + spark = SparkSession.builder() + .master("local-cluster[2,1,1024]") + .appName("testing") + .getOrCreate() + } + + override def afterAll(): Unit = { + try { + if (spark != null) { + spark.stop() + spark = null + } + } finally { + super.afterAll() + } + } + + test("alternative output committer, merge schema") { + writeDataFrame(MarkingFileOutput.COMMITTER, summary = true, check = true) + } + + test("alternative output committer, no merge schema") { + writeDataFrame(MarkingFileOutput.COMMITTER, summary = false, check = true) + } + + test("Parquet output committer, merge schema") { + writeDataFrame(PARQUET_COMMITTER, summary = true, check = false) + } + + test("Parquet output committer, no merge schema") { + writeDataFrame(PARQUET_COMMITTER, summary = false, check = false) + } + + /** + * Write a trivial dataframe as Parquet, using the given committer + * and job summary option. + * @param committer committer to use + * @param summary create a job summary + * @param check look for a marker file + * @return if a marker file was sought, it's file status. + */ + private def writeDataFrame( + committer: String, + summary: Boolean, + check: Boolean): Option[FileStatus] = { + var result: Option[FileStatus] = None + withSQLConf( + SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key -> committer, + ParquetOutputFormat.ENABLE_JOB_SUMMARY -> summary.toString) { + withTempPath { dest => + val df = spark.createDataFrame(Seq((1, "4"), (2, "2"))) + val destPath = new Path(dest.toURI) + df.write.format("parquet").save(destPath.toString) + if (check) { + result = Some(MarkingFileOutput.checkMarker( + destPath, + spark.sparkContext.hadoopConfiguration)) + } + } + } + result + } +} + +/** + * A file output committer which explicitly touches a file "marker"; this + * is how tests can verify that this committer was used. + * @param outputPath output path + * @param context task context + */ +private class MarkingFileOutputCommitter( + outputPath: Path, + context: TaskAttemptContext) extends FileOutputCommitter(outputPath, context) { + + override def commitJob(context: JobContext): Unit = { + super.commitJob(context) + MarkingFileOutput.touch(outputPath, context.getConfiguration) + } +} + +private object MarkingFileOutput { + + val COMMITTER = classOf[MarkingFileOutputCommitter].getCanonicalName + + /** + * Touch the marker. 
+ * @param outputPath destination directory + * @param conf configuration to create the FS with + */ + def touch(outputPath: Path, conf: Configuration): Unit = { + outputPath.getFileSystem(conf).create(new Path(outputPath, "marker")).close() + } + + /** + * Get the file status of the marker + * + * @param outputPath destination directory + * @param conf configuration to create the FS with + * @return the status of the marker + * @throws FileNotFoundException if the marker is absent + */ + def checkMarker(outputPath: Path, conf: Configuration): FileStatus = { + outputPath.getFileSystem(conf).getFileStatus(new Path(outputPath, "marker")) + } +} From 3ff766f61afbd09dcc7a73eae02e68a39114ce3f Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 12 Oct 2017 18:47:16 -0700 Subject: [PATCH 703/779] [SPARK-22263][SQL] Refactor deterministic as lazy value ## What changes were proposed in this pull request? The method `deterministic` is frequently called in optimizer. Refactor `deterministic` as lazy value, in order to avoid redundant computations. ## How was this patch tested? Simple benchmark test over TPC-DS queries, run time from query string to optimized plan(continuous 20 runs, and get the average of last 5 results): Before changes: 12601 ms After changes: 11993ms This is 4.8% performance improvement. Also run test with Unit test. Author: Wang Gengliang Closes #19478 from gengliangwang/deterministicAsLazyVal. --- .../sql/catalyst/expressions/CallMethodViaReflection.scala | 2 +- .../apache/spark/sql/catalyst/expressions/Expression.scala | 4 ++-- .../org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala | 2 +- .../spark/sql/catalyst/expressions/aggregate/First.scala | 2 +- .../spark/sql/catalyst/expressions/aggregate/Last.scala | 2 +- .../spark/sql/catalyst/expressions/aggregate/collect.scala | 2 +- .../org/apache/spark/sql/catalyst/expressions/misc.scala | 2 +- .../sql/execution/aggregate/TypedAggregateExpression.scala | 4 ++-- .../scala/org/apache/spark/sql/execution/aggregate/udaf.scala | 2 +- .../org/apache/spark/sql/TypedImperativeAggregateSuite.scala | 2 +- .../src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala | 4 ++-- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala index cd97304302e48..65bb9a8c642b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala @@ -76,7 +76,7 @@ case class CallMethodViaReflection(children: Seq[Expression]) } } - override def deterministic: Boolean = false + override lazy val deterministic: Boolean = false override def nullable: Boolean = true override val dataType: DataType = StringType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index c058425b4bc36..0e75ac88dc2b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -79,7 +79,7 @@ abstract class Expression extends TreeNode[Expression] { * An example would be `SparkPartitionID` that relies on the partition id returned by TaskContext. 
* By default leaf expressions are deterministic as Nil.forall(_.deterministic) returns true. */ - def deterministic: Boolean = children.forall(_.deterministic) + lazy val deterministic: Boolean = children.forall(_.deterministic) def nullable: Boolean @@ -265,7 +265,7 @@ trait NonSQLExpression extends Expression { * An expression that is nondeterministic. */ trait Nondeterministic extends Expression { - final override def deterministic: Boolean = false + final override lazy val deterministic: Boolean = false final override def foldable: Boolean = false @transient diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 527f1670c25e1..179853032035e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -49,7 +49,7 @@ case class ScalaUDF( udfDeterministic: Boolean = true) extends Expression with ImplicitCastInputTypes with NonSQLExpression with UserDefinedExpression { - override def deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) + override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) override def toString: String = s"${udfName.map(name => s"UDF:$name").getOrElse("UDF")}(${children.mkString(", ")})" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala index bfc58c22886cc..4e671e1f3e6eb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala @@ -44,7 +44,7 @@ case class First(child: Expression, ignoreNullsExpr: Expression) override def nullable: Boolean = true // First is not a deterministic function. - override def deterministic: Boolean = false + override lazy val deterministic: Boolean = false // Return data type. override def dataType: DataType = child.dataType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala index 96a6ec08a160a..0ccabb9d98914 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala @@ -44,7 +44,7 @@ case class Last(child: Expression, ignoreNullsExpr: Expression) override def nullable: Boolean = true // Last is not a deterministic function. - override def deterministic: Boolean = false + override lazy val deterministic: Boolean = false // Return data type. 
override def dataType: DataType = child.dataType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index 405c2065680f5..be972f006352e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -44,7 +44,7 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper // Both `CollectList` and `CollectSet` are non-deterministic since their results depend on the // actual order of input rows. - override def deterministic: Boolean = false + override lazy val deterministic: Boolean = false override def update(buffer: T, input: InternalRow): T = { val value = child.eval(input) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index ef293ff3f18ea..b86e271fe2958 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -119,7 +119,7 @@ case class CurrentDatabase() extends LeafExpression with Unevaluable { // scalastyle:on line.size.limit case class Uuid() extends LeafExpression { - override def deterministic: Boolean = false + override lazy val deterministic: Boolean = false override def nullable: Boolean = false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala index 717758fdf716f..aab8cc50b9526 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala @@ -127,7 +127,7 @@ case class SimpleTypedAggregateExpression( nullable: Boolean) extends DeclarativeAggregate with TypedAggregateExpression with NonSQLExpression { - override def deterministic: Boolean = true + override lazy val deterministic: Boolean = true override def children: Seq[Expression] = inputDeserializer.toSeq :+ bufferDeserializer @@ -221,7 +221,7 @@ case class ComplexTypedAggregateExpression( inputAggBufferOffset: Int = 0) extends TypedImperativeAggregate[Any] with TypedAggregateExpression with NonSQLExpression { - override def deterministic: Boolean = true + override lazy val deterministic: Boolean = true override def children: Seq[Expression] = inputDeserializer.toSeq diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index fec1add18cbf2..72aa4adff4e64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -340,7 +340,7 @@ case class ScalaUDAF( override def dataType: DataType = udaf.dataType - override def deterministic: Boolean = udaf.deterministic + override lazy val deterministic: Boolean = udaf.deterministic override val inputTypes: Seq[DataType] = udaf.inputSchema.map(_.dataType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala index b76f168220d84..c5fb17345222a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala @@ -268,7 +268,7 @@ object TypedImperativeAggregateSuite { } } - override def deterministic: Boolean = true + override lazy val deterministic: Boolean = true override def children: Seq[Expression] = Seq(child) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index e9bdcf00b9346..68af99ea272a8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -48,7 +48,7 @@ private[hive] case class HiveSimpleUDF( with Logging with UserDefinedExpression { - override def deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) + override lazy val deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) override def nullable: Boolean = true @@ -131,7 +131,7 @@ private[hive] case class HiveGenericUDF( override def nullable: Boolean = true - override def deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) + override lazy val deterministic: Boolean = isUDFDeterministic && children.forall(_.deterministic) override def foldable: Boolean = isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector] From ec122209fb35a65637df42eded64b0203e105aae Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 13 Oct 2017 13:09:35 +0800 Subject: [PATCH 704/779] [SPARK-21165][SQL] FileFormatWriter should handle mismatched attribute ids between logical and physical plan ## What changes were proposed in this pull request? Due to optimizer removing some unnecessary aliases, the logical and physical plan may have different output attribute ids. FileFormatWriter should handle this when creating the physical sort node. ## How was this patch tested? new regression test. Author: Wenchen Fan Closes #19483 from cloud-fan/bug2. --- .../datasources/FileFormatWriter.scala | 7 +++++- .../datasources/FileFormatWriterSuite.scala | 2 +- .../apache/spark/sql/hive/InsertSuite.scala | 22 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 75b1695fbc275..1fac01a2c26c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -180,8 +180,13 @@ object FileFormatWriter extends Logging { val rdd = if (orderingMatched) { queryExecution.toRdd } else { + // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and + // the physical plan may have different attribute ids due to optimizer removing some + // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch. 
+ val orderingExpr = requiredOrdering + .map(SortOrder(_, Ascending)).map(BindReferences.bindReference(_, allColumns)) SortExec( - requiredOrdering.map(SortOrder(_, Ascending)), + orderingExpr, global = false, child = queryExecution.executedPlan).execute() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala index 6f8767db176aa..13f0e0bca86c7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala @@ -32,7 +32,7 @@ class FileFormatWriterSuite extends QueryTest with SharedSQLContext { } } - test("FileFormatWriter should respect the input query schema") { + test("SPARK-22252: FileFormatWriter should respect the input query schema") { withTable("t1", "t2", "t3", "t4") { spark.range(1).select('id as 'col1, 'id as 'col2).write.saveAsTable("t1") spark.sql("select COL1, COL2 from t1").write.saveAsTable("t2") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index aa5cae33f5cd9..ab91727049ff5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -728,4 +728,26 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter assert(e.contains("mismatched input 'ROW'")) } } + + test("SPARK-21165: FileFormatWriter should only rely on attributes from analyzed plan") { + withSQLConf(("hive.exec.dynamic.partition.mode", "nonstrict")) { + withTable("tab1", "tab2") { + Seq(("a", "b", 3)).toDF("word", "first", "length").write.saveAsTable("tab1") + + spark.sql( + """ + |CREATE TABLE tab2 (word string, length int) + |PARTITIONED BY (first string) + """.stripMargin) + + spark.sql( + """ + |INSERT INTO TABLE tab2 PARTITION(first) + |SELECT word, length, cast(first as string) as first FROM tab1 + """.stripMargin) + + checkAnswer(spark.table("tab2"), Row("a", 3, "b")) + } + } + } } From 2f00a71a876321af02865d7cd53ada167e1ce2e3 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 12 Oct 2017 22:45:19 -0700 Subject: [PATCH 705/779] [SPARK-22257][SQL] Reserve all non-deterministic expressions in ExpressionSet ## What changes were proposed in this pull request? For non-deterministic expressions, they should be considered as not contained in the [[ExpressionSet]]. This is consistent with how we define `semanticEquals` between two expressions. Otherwise, combining expressions will remove non-deterministic expressions which should be reserved. E.g. Combine filters of ```scala testRelation.where(Rand(0) > 0.1).where(Rand(0) > 0.1) ``` should result in ```scala testRelation.where(Rand(0) > 0.1 && Rand(0) > 0.1) ``` ## How was this patch tested? Unit test Author: Wang Gengliang Closes #19475 from gengliangwang/non-deterministic-expressionSet. 
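To make the membership rule concrete, here is a minimal, self-contained sketch of the add behaviour described above. `Expr`, `canonical`, and `deterministic` are simplified stand-ins for the Catalyst `Expression` API, not the classes touched by this patch.

```scala
import scala.collection.mutable

// Simplified stand-in for a Catalyst expression: only the two properties that
// matter for the membership rule are modelled here.
case class Expr(canonical: String, deterministic: Boolean)

class SimpleExprSet {
  private val baseSet = mutable.HashSet.empty[String]      // canonical forms of deterministic exprs
  private val originals = mutable.ArrayBuffer.empty[Expr]  // all retained expressions

  def add(e: Expr): Unit = {
    if (!e.deterministic) {
      originals += e                          // always keep non-deterministic expressions
    } else if (!baseSet.contains(e.canonical)) {
      baseSet += e.canonical                  // deduplicate deterministic ones by canonical form
      originals += e
    }
  }

  def size: Int = originals.size
}

object SimpleExprSetDemo extends App {
  val set = new SimpleExprSet
  set.add(Expr("rand(0) > 0.1", deterministic = false))
  set.add(Expr("rand(0) > 0.1", deterministic = false)) // kept: semantically equal but non-deterministic
  set.add(Expr("a === 1", deterministic = true))
  set.add(Expr("a === 1", deterministic = true))        // dropped: duplicate deterministic expression
  assert(set.size == 3)                                  // both Rand predicates survive, as in the tests below
}
```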
--- .../catalyst/expressions/ExpressionSet.scala | 23 ++++++--- .../expressions/ExpressionSetSuite.scala | 51 +++++++++++++++---- .../optimizer/FilterPushdownSuite.scala | 15 ++++++ 3 files changed, 72 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala index 305ac90e245b8..7e8e7b8cd5f18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala @@ -30,8 +30,9 @@ object ExpressionSet { } /** - * A [[Set]] where membership is determined based on a canonical representation of an [[Expression]] - * (i.e. one that attempts to ignore cosmetic differences). See [[Canonicalize]] for more details. + * A [[Set]] where membership is determined based on determinacy and a canonical representation of + * an [[Expression]] (i.e. one that attempts to ignore cosmetic differences). + * See [[Canonicalize]] for more details. * * Internally this set uses the canonical representation, but keeps also track of the original * expressions to ease debugging. Since different expressions can share the same canonical @@ -46,6 +47,10 @@ object ExpressionSet { * set.contains(1 + a) => true * set.contains(a + 2) => false * }}} + * + * For non-deterministic expressions, they are always considered as not contained in the [[Set]]. + * On adding a non-deterministic expression, simply append it to the original expressions. + * This is consistent with how we define `semanticEquals` between two expressions. */ class ExpressionSet protected( protected val baseSet: mutable.Set[Expression] = new mutable.HashSet, @@ -53,7 +58,9 @@ class ExpressionSet protected( extends Set[Expression] { protected def add(e: Expression): Unit = { - if (!baseSet.contains(e.canonicalized)) { + if (!e.deterministic) { + originals += e + } else if (!baseSet.contains(e.canonicalized) ) { baseSet.add(e.canonicalized) originals += e } @@ -74,9 +81,13 @@ class ExpressionSet protected( } override def -(elem: Expression): ExpressionSet = { - val newBaseSet = baseSet.clone().filterNot(_ == elem.canonicalized) - val newOriginals = originals.clone().filterNot(_.canonicalized == elem.canonicalized) - new ExpressionSet(newBaseSet, newOriginals) + if (elem.deterministic) { + val newBaseSet = baseSet.clone().filterNot(_ == elem.canonicalized) + val newOriginals = originals.clone().filterNot(_.canonicalized == elem.canonicalized) + new ExpressionSet(newBaseSet, newOriginals) + } else { + new ExpressionSet(baseSet.clone(), originals.clone()) + } } override def iterator: Iterator[Expression] = originals.iterator diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala index a1000a0e80799..12eddf557109f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala @@ -175,20 +175,14 @@ class ExpressionSetSuite extends SparkFunSuite { aUpper > bUpper || aUpper <= Rand(1L) || aUpper <= 10, aUpper <= Rand(1L) || aUpper <= 10 || aUpper > bUpper) - // Partial reorder case: we don't reorder non-deterministic expressions, - // but we can reorder sub-expressions in deterministic 
AND/OR expressions. - // There are two predicates: - // (aUpper > bUpper || bUpper > 100) => we can reorder sub-expressions in it. - // (aUpper === Rand(1L)) - setTest(1, + // Keep all the non-deterministic expressions even they are semantically equal. + setTest(2, Rand(1L), Rand(1L)) + + setTest(2, (aUpper > bUpper || bUpper > 100) && aUpper === Rand(1L), (bUpper > 100 || aUpper > bUpper) && aUpper === Rand(1L)) - // There are three predicates: - // (Rand(1L) > aUpper) - // (aUpper <= Rand(1L) && aUpper > bUpper) - // (aUpper > 10 && bUpper > 10) => we can reorder sub-expressions in it. - setTest(1, + setTest(2, Rand(1L) > aUpper || (aUpper <= Rand(1L) && aUpper > bUpper) || (aUpper > 10 && bUpper > 10), Rand(1L) > aUpper || (aUpper <= Rand(1L) && aUpper > bUpper) || (bUpper > 10 && aUpper > 10)) @@ -219,4 +213,39 @@ class ExpressionSetSuite extends SparkFunSuite { assert((initialSet ++ setToAddWithSameExpression).size == 2) assert((initialSet ++ setToAddWithOutSameExpression).size == 3) } + + test("add single element to set with non-deterministic expressions") { + val initialSet = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + + assert((initialSet + (aUpper + 1)).size == 2) + assert((initialSet + Rand(0)).size == 3) + assert((initialSet + (aUpper + 2)).size == 3) + } + + test("remove single element to set with non-deterministic expressions") { + val initialSet = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + + assert((initialSet - (aUpper + 1)).size == 1) + assert((initialSet - Rand(0)).size == 2) + assert((initialSet - (aUpper + 2)).size == 2) + } + + test("add multiple elements to set with non-deterministic expressions") { + val initialSet = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + val setToAddWithSameDeterministicExpression = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + val setToAddWithOutSameExpression = ExpressionSet(aUpper + 3 :: aUpper + 4 :: Nil) + + assert((initialSet ++ setToAddWithSameDeterministicExpression).size == 3) + assert((initialSet ++ setToAddWithOutSameExpression).size == 4) + } + + test("remove multiple elements to set with non-deterministic expressions") { + val initialSet = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + val setToRemoveWithSameDeterministicExpression = ExpressionSet(aUpper + 1 :: Rand(0) :: Nil) + val setToRemoveWithOutSameExpression = ExpressionSet(aUpper + 3 :: aUpper + 4 :: Nil) + + assert((initialSet -- setToRemoveWithSameDeterministicExpression).size == 1) + assert((initialSet -- setToRemoveWithOutSameExpression).size == 2) + } + } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 582b3ead5e54a..de0e7c7ee49ac 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -94,6 +94,21 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("combine redundant deterministic filters") { + val originalQuery = + testRelation + .where(Rand(0) > 0.1 && 'a === 1) + .where(Rand(0) > 0.1 && 'a === 1) + + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = + testRelation + .where(Rand(0) > 0.1 && 'a === 1 && Rand(0) > 0.1) + .analyze + + comparePlans(optimized, correctAnswer) + } + test("SPARK-16164: Filter pushdown should keep the ordering in the logical plan") { val originalQuery = 
testRelation From e6e36004afc3f9fc8abea98542248e9de11b4435 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 13 Oct 2017 23:09:12 +0800 Subject: [PATCH 706/779] [SPARK-14387][SPARK-16628][SPARK-18355][SQL] Use Spark schema to read ORC table instead of ORC file schema ## What changes were proposed in this pull request? Before Hive 2.0, ORC File schema has invalid column names like `_col1` and `_col2`. This is a well-known limitation and there are several Apache Spark issues with `spark.sql.hive.convertMetastoreOrc=true`. This PR ignores ORC File schema and use Spark schema. ## How was this patch tested? Pass the newly added test case. Author: Dongjoon Hyun Closes #19470 from dongjoon-hyun/SPARK-18355. --- .../spark/sql/hive/orc/OrcFileFormat.scala | 31 ++++++---- .../sql/hive/execution/SQLQuerySuite.scala | 62 ++++++++++++++++++- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index c76f0ebb36a60..194e69c93e1a8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -134,12 +134,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file // using the given physical schema. Instead, we simply return an empty iterator. - val maybePhysicalSchema = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)) - if (maybePhysicalSchema.isEmpty) { + val isEmptyFile = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)).isEmpty + if (isEmptyFile) { Iterator.empty } else { - val physicalSchema = maybePhysicalSchema.get - OrcRelation.setRequiredColumns(conf, physicalSchema, requiredSchema) + OrcRelation.setRequiredColumns(conf, dataSchema, requiredSchema) val orcRecordReader = { val job = Job.getInstance(conf) @@ -163,6 +162,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable // Unwraps `OrcStruct`s to `UnsafeRow`s OrcRelation.unwrapOrcStructs( conf, + dataSchema, requiredSchema, Some(orcRecordReader.getObjectInspector.asInstanceOf[StructObjectInspector]), recordsIterator) @@ -272,25 +272,32 @@ private[orc] object OrcRelation extends HiveInspectors { def unwrapOrcStructs( conf: Configuration, dataSchema: StructType, + requiredSchema: StructType, maybeStructOI: Option[StructObjectInspector], iterator: Iterator[Writable]): Iterator[InternalRow] = { val deserializer = new OrcSerde - val mutableRow = new SpecificInternalRow(dataSchema.map(_.dataType)) - val unsafeProjection = UnsafeProjection.create(dataSchema) + val mutableRow = new SpecificInternalRow(requiredSchema.map(_.dataType)) + val unsafeProjection = UnsafeProjection.create(requiredSchema) def unwrap(oi: StructObjectInspector): Iterator[InternalRow] = { - val (fieldRefs, fieldOrdinals) = dataSchema.zipWithIndex.map { - case (field, ordinal) => oi.getStructFieldRef(field.name) -> ordinal + val (fieldRefs, fieldOrdinals) = requiredSchema.zipWithIndex.map { + case (field, ordinal) => + var ref = oi.getStructFieldRef(field.name) + if (ref == null) { + ref = oi.getStructFieldRef("_col" + dataSchema.fieldIndex(field.name)) + } + ref -> ordinal }.unzip - val unwrappers = fieldRefs.map(unwrapperFor) + val unwrappers = 
fieldRefs.map(r => if (r == null) null else unwrapperFor(r)) iterator.map { value => val raw = deserializer.deserialize(value) var i = 0 val length = fieldRefs.length while (i < length) { - val fieldValue = oi.getStructFieldData(raw, fieldRefs(i)) + val fieldRef = fieldRefs(i) + val fieldValue = if (fieldRef == null) null else oi.getStructFieldData(raw, fieldRef) if (fieldValue == null) { mutableRow.setNullAt(fieldOrdinals(i)) } else { @@ -306,8 +313,8 @@ private[orc] object OrcRelation extends HiveInspectors { } def setRequiredColumns( - conf: Configuration, physicalSchema: StructType, requestedSchema: StructType): Unit = { - val ids = requestedSchema.map(a => physicalSchema.fieldIndex(a.name): Integer) + conf: Configuration, dataSchema: StructType, requestedSchema: StructType): Unit = { + val ids = requestedSchema.map(a => dataSchema.fieldIndex(a.name): Integer) val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip HiveShim.appendReadColumns(conf, sortedIDs, sortedNames) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 09c59000b3e3f..94fa43dec7313 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils @@ -2050,4 +2050,64 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + Seq("orc", "parquet").foreach { format => + test(s"SPARK-18355 Read data from a hive table with a new column - $format") { + val client = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client + + Seq("true", "false").foreach { value => + withSQLConf( + HiveUtils.CONVERT_METASTORE_ORC.key -> value, + HiveUtils.CONVERT_METASTORE_PARQUET.key -> value) { + withTempDatabase { db => + client.runSqlHive( + s""" + |CREATE TABLE $db.t( + | click_id string, + | search_id string, + | uid bigint) + |PARTITIONED BY ( + | ts string, + | hour string) + |STORED AS $format + """.stripMargin) + + client.runSqlHive( + s""" + |INSERT INTO TABLE $db.t + |PARTITION (ts = '98765', hour = '01') + |VALUES (12, 2, 12345) + """.stripMargin + ) + + checkAnswer( + sql(s"SELECT click_id, search_id, uid, ts, hour FROM $db.t"), + Row("12", "2", 12345, "98765", "01")) + + client.runSqlHive(s"ALTER TABLE $db.t ADD COLUMNS (dummy string)") + + checkAnswer( + sql(s"SELECT click_id, search_id FROM $db.t"), + Row("12", "2")) + + checkAnswer( + sql(s"SELECT search_id, click_id FROM $db.t"), + Row("2", "12")) + + checkAnswer( + sql(s"SELECT search_id FROM $db.t"), + Row("2")) + + checkAnswer( + sql(s"SELECT dummy, click_id FROM $db.t"), + Row(null, "12")) + + checkAnswer( + sql(s"SELECT click_id, search_id, uid, dummy, ts, hour FROM $db.t"), + Row("12", "2", 12345, null, "98765", "01")) + } + } + } + } + } } From 6412ea1759d39a2380c572ec24cfd8ae4f2d81f7 Mon Sep 17 00:00:00 2001 From: 
Dongjoon Hyun Date: Sat, 14 Oct 2017 00:35:12 +0800 Subject: [PATCH 707/779] [SPARK-21247][SQL] Type comparison should respect case-sensitive SQL conf ## What changes were proposed in this pull request? This is an effort to reduce the difference between Hive and Spark. Spark supports case-sensitivity in columns. Especially, for Struct types, with `spark.sql.caseSensitive=true`, the following is supported. ```scala scala> sql("select named_struct('a', 1, 'A', 2).a").show +--------------------------+ |named_struct(a, 1, A, 2).a| +--------------------------+ | 1| +--------------------------+ scala> sql("select named_struct('a', 1, 'A', 2).A").show +--------------------------+ |named_struct(a, 1, A, 2).A| +--------------------------+ | 2| +--------------------------+ ``` And vice versa, with `spark.sql.caseSensitive=false`, the following is supported. ```scala scala> sql("select named_struct('a', 1).A, named_struct('A', 1).a").show +--------------------+--------------------+ |named_struct(a, 1).A|named_struct(A, 1).a| +--------------------+--------------------+ | 1| 1| +--------------------+--------------------+ ``` However, types are considered different. For example, SET operations fail. ```scala scala> sql("SELECT named_struct('a',1) union all (select named_struct('A',2))").show org.apache.spark.sql.AnalysisException: Union can only be performed on tables with the compatible column types. struct <> struct at the first column of the second table;; 'Union :- Project [named_struct(a, 1) AS named_struct(a, 1)#57] : +- OneRowRelation$ +- Project [named_struct(A, 2) AS named_struct(A, 2)#58] +- OneRowRelation$ ``` This PR aims to support case-insensitive type equality. For example, in Set operation, the above operation succeed when `spark.sql.caseSensitive=false`. ```scala scala> sql("SELECT named_struct('a',1) union all (select named_struct('A',2))").show +------------------+ |named_struct(a, 1)| +------------------+ | [1]| | [2]| +------------------+ ``` ## How was this patch tested? Pass the Jenkins with a newly add test case. Author: Dongjoon Hyun Closes #18460 from dongjoon-hyun/SPARK-21247. --- .../sql/catalyst/analysis/TypeCoercion.scala | 10 ++++ .../org/apache/spark/sql/types/DataType.scala | 7 ++- .../catalyst/analysis/TypeCoercionSuite.scala | 52 +++++++++++++++++-- .../org/apache/spark/sql/SQLQuerySuite.scala | 38 ++++++++++++++ 4 files changed, 102 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 9ffe646b5e4ec..532d22dbf2321 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -100,6 +100,16 @@ object TypeCoercion { case (_: TimestampType, _: DateType) | (_: DateType, _: TimestampType) => Some(TimestampType) + case (t1 @ StructType(fields1), t2 @ StructType(fields2)) if t1.sameType(t2) => + Some(StructType(fields1.zip(fields2).map { case (f1, f2) => + // Since `t1.sameType(t2)` is true, two StructTypes have the same DataType + // except `name` (in case of `spark.sql.caseSensitive=false`) and `nullable`. + // - Different names: use f1.name + // - Different nullabilities: `nullable` is true iff one of them is nullable. 
+ val dataType = findTightestCommonType(f1.dataType, f2.dataType).get + StructField(f1.name, dataType, nullable = f1.nullable || f2.nullable) + })) + case _ => None } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index 30745c6a9d42a..d6e0df12218ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -26,6 +26,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Utils /** @@ -80,7 +81,11 @@ abstract class DataType extends AbstractDataType { * (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`). */ private[spark] def sameType(other: DataType): Boolean = - DataType.equalsIgnoreNullability(this, other) + if (SQLConf.get.caseSensitiveAnalysis) { + DataType.equalsIgnoreNullability(this, other) + } else { + DataType.equalsIgnoreCaseAndNullability(this, other) + } /** * Returns the same data type but set all nullability fields are true diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index d62e3b6dfe34f..793e04f66f0f9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -131,14 +131,17 @@ class TypeCoercionSuite extends AnalysisTest { widenFunc: (DataType, DataType) => Option[DataType], t1: DataType, t2: DataType, - expected: Option[DataType]): Unit = { + expected: Option[DataType], + isSymmetric: Boolean = true): Unit = { var found = widenFunc(t1, t2) assert(found == expected, s"Expected $expected as wider common type for $t1 and $t2, found $found") // Test both directions to make sure the widening is symmetric. 
- found = widenFunc(t2, t1) - assert(found == expected, - s"Expected $expected as wider common type for $t2 and $t1, found $found") + if (isSymmetric) { + found = widenFunc(t2, t1) + assert(found == expected, + s"Expected $expected as wider common type for $t2 and $t1, found $found") + } } test("implicit type cast - ByteType") { @@ -385,6 +388,47 @@ class TypeCoercionSuite extends AnalysisTest { widenTest(NullType, StructType(Seq()), Some(StructType(Seq()))) widenTest(StringType, MapType(IntegerType, StringType, true), None) widenTest(ArrayType(IntegerType), StructType(Seq()), None) + + widenTest( + StructType(Seq(StructField("a", IntegerType))), + StructType(Seq(StructField("b", IntegerType))), + None) + widenTest( + StructType(Seq(StructField("a", IntegerType, nullable = false))), + StructType(Seq(StructField("a", DoubleType, nullable = false))), + None) + + widenTest( + StructType(Seq(StructField("a", IntegerType, nullable = false))), + StructType(Seq(StructField("a", IntegerType, nullable = false))), + Some(StructType(Seq(StructField("a", IntegerType, nullable = false))))) + widenTest( + StructType(Seq(StructField("a", IntegerType, nullable = false))), + StructType(Seq(StructField("a", IntegerType, nullable = true))), + Some(StructType(Seq(StructField("a", IntegerType, nullable = true))))) + widenTest( + StructType(Seq(StructField("a", IntegerType, nullable = true))), + StructType(Seq(StructField("a", IntegerType, nullable = false))), + Some(StructType(Seq(StructField("a", IntegerType, nullable = true))))) + widenTest( + StructType(Seq(StructField("a", IntegerType, nullable = true))), + StructType(Seq(StructField("a", IntegerType, nullable = true))), + Some(StructType(Seq(StructField("a", IntegerType, nullable = true))))) + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + widenTest( + StructType(Seq(StructField("a", IntegerType))), + StructType(Seq(StructField("A", IntegerType))), + None) + } + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + checkWidenType( + TypeCoercion.findTightestCommonType, + StructType(Seq(StructField("a", IntegerType), StructField("B", IntegerType))), + StructType(Seq(StructField("A", IntegerType), StructField("b", IntegerType))), + Some(StructType(Seq(StructField("a", IntegerType), StructField("B", IntegerType)))), + isSymmetric = false) + } } test("wider common type for decimal and array") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 93a7777b70b46..f0c58e2e5bf45 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2646,6 +2646,44 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-21247: Allow case-insensitive type equality in Set operation") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql("SELECT struct(1 a) UNION ALL (SELECT struct(2 A))") + sql("SELECT struct(1 a) EXCEPT (SELECT struct(2 A))") + + withTable("t", "S") { + sql("CREATE TABLE t(c struct) USING parquet") + sql("CREATE TABLE S(C struct) USING parquet") + Seq(("c", "C"), ("C", "c"), ("c.f", "C.F"), ("C.F", "c.f")).foreach { + case (left, right) => + checkAnswer(sql(s"SELECT * FROM t, S WHERE t.$left = S.$right"), Seq.empty) + } + } + } + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val m1 = intercept[AnalysisException] { + sql("SELECT struct(1 a) UNION ALL (SELECT struct(2 A))") + }.message + assert(m1.contains("Union can 
only be performed on tables with the compatible column types")) + + val m2 = intercept[AnalysisException] { + sql("SELECT struct(1 a) EXCEPT (SELECT struct(2 A))") + }.message + assert(m2.contains("Except can only be performed on tables with the compatible column types")) + + withTable("t", "S") { + sql("CREATE TABLE t(c struct) USING parquet") + sql("CREATE TABLE S(C struct) USING parquet") + checkAnswer(sql("SELECT * FROM t, S WHERE t.c.f = S.C.F"), Seq.empty) + val m = intercept[AnalysisException] { + sql("SELECT * FROM t, S WHERE c = C") + }.message + assert(m.contains("cannot resolve '(t.`c` = S.`C`)' due to data type mismatch")) + } + } + } + test("SPARK-21335: support un-aliased subquery") { withTempView("v") { Seq(1 -> "a").toDF("i", "j").createOrReplaceTempView("v") From 3823dc88d3816c7d1099f9601426108acc90574c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 13 Oct 2017 10:49:48 -0700 Subject: [PATCH 708/779] [SPARK-22252][SQL][FOLLOWUP] Command should not be a LeafNode ## What changes were proposed in this pull request? This is a minor folllowup of #19474 . #19474 partially reverted #18064 but accidentally introduced a behavior change. `Command` extended `LogicalPlan` before #18064 , but #19474 made it extend `LeafNode`. This is an internal behavior change as now all `Command` subclasses can't define children, and they have to implement `computeStatistic` method. This PR fixes this by making `Command` extend `LogicalPlan` ## How was this patch tested? N/A Author: Wenchen Fan Closes #19493 from cloud-fan/minor. --- .../org/apache/spark/sql/catalyst/plans/logical/Command.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala index 38f47081b6f55..ec5766e1f67f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute * commands can be used by parsers to represent DDL operations. Commands, unlike queries, are * eagerly executed. */ -trait Command extends LeafNode { +trait Command extends LogicalPlan { override def output: Seq[Attribute] = Seq.empty + override def children: Seq[LogicalPlan] = Seq.empty } From 1bb8b76045420b61a37806d6f4765af15c4052a7 Mon Sep 17 00:00:00 2001 From: Liwei Lin Date: Fri, 13 Oct 2017 15:13:06 -0700 Subject: [PATCH 709/779] [MINOR][SS] keyWithIndexToNumValues" -> "keyWithIndexToValue" ## What changes were proposed in this pull request? This PR changes `keyWithIndexToNumValues` to `keyWithIndexToValue`. There will be directories on HDFS named with this `keyWithIndexToNumValues`. So if we ever want to fix this, let's fix it now. ## How was this patch tested? existing unit test cases. Author: Liwei Lin Closes #19435 from lw-lin/keyWithIndex. 
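As a rough illustration of why the name is worth settling now: a store-type name typically ends up embedded in the on-disk checkpoint layout, so renaming it after queries have written state would orphan existing directories. The path format in this sketch is an assumption for illustration only, not the exact layout used by the state store.

```scala
// Hypothetical sketch: how a store-type string can surface in a checkpoint path.
sealed trait StateStoreType
case object KeyToNumValuesType extends StateStoreType {
  override def toString: String = "keyToNumValues"
}
case object KeyWithIndexToValueType extends StateStoreType {
  override def toString: String = "keyWithIndexToValue"
}

object StoreDirDemo {
  // Illustrative path layout only; the real layout is defined by the state store implementation.
  def storeDir(checkpointRoot: String, operatorId: Long, partitionId: Int, storeType: StateStoreType): String =
    s"$checkpointRoot/state/$operatorId/$partitionId/$storeType"

  // storeDir("/chk", 0, 3, KeyWithIndexToValueType) => "/chk/state/0/3/keyWithIndexToValue"
}
```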
--- .../streaming/state/SymmetricHashJoinStateManager.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index d256fb578d921..6b386308c79fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -384,7 +384,7 @@ class SymmetricHashJoinStateManager( } /** A wrapper around a [[StateStore]] that stores [(key, index) -> value]. */ - private class KeyWithIndexToValueStore extends StateStoreHandler(KeyWithIndexToValuesType) { + private class KeyWithIndexToValueStore extends StateStoreHandler(KeyWithIndexToValueType) { private val keyWithIndexExprs = keyAttributes :+ Literal(1L) private val keyWithIndexSchema = keySchema.add("index", LongType) private val indexOrdinalInKeyWithIndexRow = keyAttributes.size @@ -471,7 +471,7 @@ class SymmetricHashJoinStateManager( object SymmetricHashJoinStateManager { def allStateStoreNames(joinSides: JoinSide*): Seq[String] = { - val allStateStoreTypes: Seq[StateStoreType] = Seq(KeyToNumValuesType, KeyWithIndexToValuesType) + val allStateStoreTypes: Seq[StateStoreType] = Seq(KeyToNumValuesType, KeyWithIndexToValueType) for (joinSide <- joinSides; stateStoreType <- allStateStoreTypes) yield { getStateStoreName(joinSide, stateStoreType) } @@ -483,8 +483,8 @@ object SymmetricHashJoinStateManager { override def toString(): String = "keyToNumValues" } - private case object KeyWithIndexToValuesType extends StateStoreType { - override def toString(): String = "keyWithIndexToNumValues" + private case object KeyWithIndexToValueType extends StateStoreType { + override def toString(): String = "keyWithIndexToValue" } private def getStateStoreName(joinSide: JoinSide, storeType: StateStoreType): String = { From 06df34d35ec088277445ef09cfb24bfe996f072e Mon Sep 17 00:00:00 2001 From: Devaraj K Date: Fri, 13 Oct 2017 17:12:50 -0700 Subject: [PATCH 710/779] [SPARK-11034][LAUNCHER][MESOS] Launcher: add support for monitoring Mesos apps ## What changes were proposed in this pull request? Added Launcher support for monitoring Mesos apps in Client mode. SPARK-11033 can handle the support for Mesos/Cluster mode since the Standalone/Cluster and Mesos/Cluster modes use the same code at client side. ## How was this patch tested? I verified it manually by running launcher application, able to launch, stop and kill the mesos applications and also can invoke other launcher API's. Author: Devaraj K Closes #19385 from devaraj-kavali/SPARK-11034. 
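For context, a minimal sketch of how a client-mode Mesos application could be observed through the public launcher API once this backend reports its state. The app resource, main class, and master URL are placeholders, not values from this patch.

```scala
import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}

object MonitorMesosApp {
  def main(args: Array[String]): Unit = {
    val handle = new SparkLauncher()
      .setAppResource("/path/to/app.jar")      // placeholder
      .setMainClass("com.example.Main")        // placeholder
      .setMaster("mesos://host:5050")          // placeholder Mesos master
      .setDeployMode("client")
      .startApplication(new SparkAppHandle.Listener {
        override def stateChanged(h: SparkAppHandle): Unit =
          println(s"state=${h.getState} appId=${h.getAppId}")
        override def infoChanged(h: SparkAppHandle): Unit = ()
      })

    // The handle can also be used to stop() or kill() the application.
    while (!handle.getState.isFinal) {
      Thread.sleep(1000)
    }
  }
}
```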
--- .../MesosCoarseGrainedSchedulerBackend.scala | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 80c0a041b7322..603c980cb268d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -32,6 +32,7 @@ import org.apache.spark.{SecurityManager, SparkContext, SparkException, TaskStat import org.apache.spark.deploy.mesos.config._ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.config +import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient import org.apache.spark.rpc.RpcEndpointAddress @@ -89,6 +90,13 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( // Synchronization protected by stateLock private[this] var stopCalled: Boolean = false + private val launcherBackend = new LauncherBackend() { + override protected def onStopRequest(): Unit = { + stopSchedulerBackend() + setState(SparkAppHandle.State.KILLED) + } + } + // If shuffle service is enabled, the Spark driver will register with the shuffle service. // This is for cleaning up shuffle files reliably. private val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false) @@ -182,6 +190,9 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( override def start() { super.start() + if (sc.deployMode == "client") { + launcherBackend.connect() + } val startedBefore = IdHelper.startedBefore.getAndSet(true) val suffix = if (startedBefore) { @@ -202,6 +213,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( sc.conf.getOption("spark.mesos.driver.frameworkId").map(_ + suffix) ) + launcherBackend.setState(SparkAppHandle.State.SUBMITTED) startScheduler(driver) } @@ -295,15 +307,21 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( this.mesosExternalShuffleClient.foreach(_.init(appId)) this.schedulerDriver = driver markRegistered() + launcherBackend.setAppId(appId) + launcherBackend.setState(SparkAppHandle.State.RUNNING) } override def sufficientResourcesRegistered(): Boolean = { totalCoreCount.get >= maxCoresOption.getOrElse(0) * minRegisteredRatio } - override def disconnected(d: org.apache.mesos.SchedulerDriver) {} + override def disconnected(d: org.apache.mesos.SchedulerDriver) { + launcherBackend.setState(SparkAppHandle.State.SUBMITTED) + } - override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) {} + override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) { + launcherBackend.setState(SparkAppHandle.State.RUNNING) + } /** * Method called by Mesos to offer resources on slaves. 
We respond by launching an executor, @@ -611,6 +629,12 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } override def stop() { + stopSchedulerBackend() + launcherBackend.setState(SparkAppHandle.State.FINISHED) + launcherBackend.close() + } + + private def stopSchedulerBackend() { // Make sure we're not launching tasks during shutdown stateLock.synchronized { if (stopCalled) { From e3536406ec6ff65a8b41ba2f2fd40517a760cfd6 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 13 Oct 2017 23:08:17 -0700 Subject: [PATCH 711/779] [SPARK-21762][SQL] FileFormatWriter/BasicWriteTaskStatsTracker metrics collection fails if a new file isn't yet visible ## What changes were proposed in this pull request? `BasicWriteTaskStatsTracker.getFileSize()` to catch `FileNotFoundException`, log info and then return 0 as a file size. This ensures that if a newly created file isn't visible due to the store not always having create consistency, the metric collection doesn't cause the failure. ## How was this patch tested? New test suite included, `BasicWriteTaskStatsTrackerSuite`. This not only checks the resilience to missing files, but verifies the existing logic as to how file statistics are gathered. Note that in the current implementation 1. if you call `Tracker..getFinalStats()` more than once, the file size count will increase by size of the last file. This could be fixed by clearing the filename field inside `getFinalStats()` itself. 2. If you pass in an empty or null string to `Tracker.newFile(path)` then IllegalArgumentException is raised, but only in `getFinalStats()`, rather than in `newFile`. There's a test for this behaviour in the new suite, as it verifies that only FNFEs get swallowed. Author: Steve Loughran Closes #18979 from steveloughran/cloud/SPARK-21762-missing-files-in-metrics. 
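The core of the change is a defensive file-length lookup. As a standalone sketch of the pattern (not the patch code itself), a missing file yields `None` instead of failing the task, and the metrics sum only the files that could actually be seen:

```scala
import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object VisibleFileSize {
  // Returns None instead of throwing when a freshly written file is not yet
  // visible, e.g. on an eventually consistent object store.
  def fileSizeIfVisible(filePath: String, hadoopConf: Configuration): Option[Long] = {
    val path = new Path(filePath)
    val fs = path.getFileSystem(hadoopConf)
    try {
      Some(fs.getFileStatus(path).getLen)
    } catch {
      case _: FileNotFoundException => None
    }
  }

  // Missing files degrade the statistics instead of failing the write job.
  def totalBytes(paths: Seq[String], hadoopConf: Configuration): Long =
    paths.flatMap(fileSizeIfVisible(_, hadoopConf)).sum
}
```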
--- .../datasources/BasicWriteStatsTracker.scala | 49 +++- .../BasicWriteTaskStatsTrackerSuite.scala | 220 ++++++++++++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 8 + 3 files changed, 265 insertions(+), 12 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala index b8f7d130d569f..11af0aaa7b206 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala @@ -17,10 +17,13 @@ package org.apache.spark.sql.execution.datasources +import java.io.FileNotFoundException + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext +import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} @@ -44,20 +47,32 @@ case class BasicWriteTaskStats( * @param hadoopConf */ class BasicWriteTaskStatsTracker(hadoopConf: Configuration) - extends WriteTaskStatsTracker { + extends WriteTaskStatsTracker with Logging { private[this] var numPartitions: Int = 0 private[this] var numFiles: Int = 0 + private[this] var submittedFiles: Int = 0 private[this] var numBytes: Long = 0L private[this] var numRows: Long = 0L - private[this] var curFile: String = null - + private[this] var curFile: Option[String] = None - private def getFileSize(filePath: String): Long = { + /** + * Get the size of the file expected to have been written by a worker. + * @param filePath path to the file + * @return the file size or None if the file was not found. + */ + private def getFileSize(filePath: String): Option[Long] = { val path = new Path(filePath) val fs = path.getFileSystem(hadoopConf) - fs.getFileStatus(path).getLen() + try { + Some(fs.getFileStatus(path).getLen()) + } catch { + case e: FileNotFoundException => + // may arise against eventually consistent object stores + logDebug(s"File $path is not yet visible", e) + None + } } @@ -70,12 +85,19 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration) } override def newFile(filePath: String): Unit = { - if (numFiles > 0) { - // we assume here that we've finished writing to disk the previous file by now - numBytes += getFileSize(curFile) + statCurrentFile() + curFile = Some(filePath) + submittedFiles += 1 + } + + private def statCurrentFile(): Unit = { + curFile.foreach { path => + getFileSize(path).foreach { len => + numBytes += len + numFiles += 1 + } + curFile = None } - curFile = filePath - numFiles += 1 } override def newRow(row: InternalRow): Unit = { @@ -83,8 +105,11 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration) } override def getFinalStats(): WriteTaskStats = { - if (numFiles > 0) { - numBytes += getFileSize(curFile) + statCurrentFile() + if (submittedFiles != numFiles) { + logInfo(s"Expected $submittedFiles files, but only saw $numFiles. 
" + + "This could be due to the output format not writing empty files, " + + "or files being not immediately visible in the filesystem.") } BasicWriteTaskStats(numPartitions, numFiles, numBytes, numRows) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala new file mode 100644 index 0000000000000..bf3c8ede9a980 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.nio.charset.Charset + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkFunSuite +import org.apache.spark.util.Utils + +/** + * Test how BasicWriteTaskStatsTracker handles files. + * + * Two different datasets are written (alongside 0), one of + * length 10, one of 3. They were chosen to be distinct enough + * that it is straightforward to determine which file lengths were added + * from the sum of all files added. Lengths like "10" and "5" would + * be less informative. + */ +class BasicWriteTaskStatsTrackerSuite extends SparkFunSuite { + + private val tempDir = Utils.createTempDir() + private val tempDirPath = new Path(tempDir.toURI) + private val conf = new Configuration() + private val localfs = tempDirPath.getFileSystem(conf) + private val data1 = "0123456789".getBytes(Charset.forName("US-ASCII")) + private val data2 = "012".getBytes(Charset.forName("US-ASCII")) + private val len1 = data1.length + private val len2 = data2.length + + /** + * In teardown delete the temp dir. + */ + protected override def afterAll(): Unit = { + Utils.deleteRecursively(tempDir) + } + + /** + * Assert that the stats match that expected. 
+ * @param tracker tracker to check + * @param files number of files expected + * @param bytes total number of bytes expected + */ + private def assertStats( + tracker: BasicWriteTaskStatsTracker, + files: Int, + bytes: Int): Unit = { + val stats = finalStatus(tracker) + assert(files === stats.numFiles, "Wrong number of files") + assert(bytes === stats.numBytes, "Wrong byte count of file size") + } + + private def finalStatus(tracker: BasicWriteTaskStatsTracker): BasicWriteTaskStats = { + tracker.getFinalStats().asInstanceOf[BasicWriteTaskStats] + } + + test("No files in run") { + val tracker = new BasicWriteTaskStatsTracker(conf) + assertStats(tracker, 0, 0) + } + + test("Missing File") { + val missing = new Path(tempDirPath, "missing") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(missing.toString) + assertStats(tracker, 0, 0) + } + + test("Empty filename is forwarded") { + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile("") + intercept[IllegalArgumentException] { + finalStatus(tracker) + } + } + + test("Null filename is only picked up in final status") { + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(null) + intercept[IllegalArgumentException] { + finalStatus(tracker) + } + } + + test("0 byte file") { + val file = new Path(tempDirPath, "file0") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(file.toString) + touch(file) + assertStats(tracker, 1, 0) + } + + test("File with data") { + val file = new Path(tempDirPath, "file-with-data") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(file.toString) + write1(file) + assertStats(tracker, 1, len1) + } + + test("Open file") { + val file = new Path(tempDirPath, "file-open") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(file.toString) + val stream = localfs.create(file, true) + try { + assertStats(tracker, 1, 0) + stream.write(data1) + stream.flush() + assert(1 === finalStatus(tracker).numFiles, "Wrong number of files") + } finally { + stream.close() + } + } + + test("Two files") { + val file1 = new Path(tempDirPath, "f-2-1") + val file2 = new Path(tempDirPath, "f-2-2") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(file1.toString) + write1(file1) + tracker.newFile(file2.toString) + write2(file2) + assertStats(tracker, 2, len1 + len2) + } + + test("Three files, last one empty") { + val file1 = new Path(tempDirPath, "f-3-1") + val file2 = new Path(tempDirPath, "f-3-2") + val file3 = new Path(tempDirPath, "f-3-2") + val tracker = new BasicWriteTaskStatsTracker(conf) + tracker.newFile(file1.toString) + write1(file1) + tracker.newFile(file2.toString) + write2(file2) + tracker.newFile(file3.toString) + touch(file3) + assertStats(tracker, 3, len1 + len2) + } + + test("Three files, one not found") { + val file1 = new Path(tempDirPath, "f-4-1") + val file2 = new Path(tempDirPath, "f-4-2") + val file3 = new Path(tempDirPath, "f-3-2") + val tracker = new BasicWriteTaskStatsTracker(conf) + // file 1 + tracker.newFile(file1.toString) + write1(file1) + + // file 2 is noted, but not created + tracker.newFile(file2.toString) + + // file 3 is noted & then created + tracker.newFile(file3.toString) + write2(file3) + + // the expected size is file1 + file3; only two files are reported + // as found + assertStats(tracker, 2, len1 + len2) + } + + /** + * Write a 0-byte file. 
+ * @param file file path + */ + private def touch(file: Path): Unit = { + localfs.create(file, true).close() + } + + /** + * Write a byte array. + * @param file path to file + * @param data data + * @return bytes written + */ + private def write(file: Path, data: Array[Byte]): Integer = { + val stream = localfs.create(file, true) + try { + stream.write(data) + } finally { + stream.close() + } + data.length + } + + /** + * Write a data1 array. + * @param file file + */ + private def write1(file: Path): Unit = { + write(file, data1) + } + + /** + * Write a data2 array. + * + * @param file file + */ + private def write2(file: Path): Unit = { + write(file, data2) + } + +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 94fa43dec7313..60935c3e85c43 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2110,4 +2110,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + Seq("orc", "parquet", "csv", "json", "text").foreach { format => + test(s"Writing empty datasets should not fail - $format") { + withTempDir { dir => + Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") + } + } + } } From e0503a7223410289d01bc4b20da3a451730577da Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 13 Oct 2017 23:24:36 -0700 Subject: [PATCH 712/779] [SPARK-22273][SQL] Fix key/value schema field names in HashMapGenerators. ## What changes were proposed in this pull request? When fixing schema field names using escape characters with `addReferenceMinorObj()` at [SPARK-18952](https://issues.apache.org/jira/browse/SPARK-18952) (#16361), double-quotes around the names were remained and the names become something like `"((java.lang.String) references[1])"`. ```java /* 055 */ private int maxSteps = 2; /* 056 */ private int numRows = 0; /* 057 */ private org.apache.spark.sql.types.StructType keySchema = new org.apache.spark.sql.types.StructType().add("((java.lang.String) references[1])", org.apache.spark.sql.types.DataTypes.StringType); /* 058 */ private org.apache.spark.sql.types.StructType valueSchema = new org.apache.spark.sql.types.StructType().add("((java.lang.String) references[2])", org.apache.spark.sql.types.DataTypes.LongType); /* 059 */ private Object emptyVBase; ``` We should remove the double-quotes to refer the values in `references` properly: ```java /* 055 */ private int maxSteps = 2; /* 056 */ private int numRows = 0; /* 057 */ private org.apache.spark.sql.types.StructType keySchema = new org.apache.spark.sql.types.StructType().add(((java.lang.String) references[1]), org.apache.spark.sql.types.DataTypes.StringType); /* 058 */ private org.apache.spark.sql.types.StructType valueSchema = new org.apache.spark.sql.types.StructType().add(((java.lang.String) references[2]), org.apache.spark.sql.types.DataTypes.LongType); /* 059 */ private Object emptyVBase; ``` ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #19491 from ueshin/issues/SPARK-22273. 
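The snippet below reproduces the quoting issue in isolation. `keyName` stands in for the string returned by `addReferenceMinorObj`, and the generated Java fragments are shown as comments; it is an illustration of the interpolation behaviour, not code from the patch.

```scala
object SchemaFieldNameQuoting {
  // keyName stands in for the fragment returned by addReferenceMinorObj: it is
  // itself Java source that looks up the real field name from the references array.
  val keyName = "((java.lang.String) references[1])"

  // Before the fix: the surrounding quotes turn the reference lookup into a string
  // literal, so the generated schema field is literally named after the fragment.
  val before: String = s""".add("$keyName", org.apache.spark.sql.types.DataTypes.StringType)"""
  // => .add("((java.lang.String) references[1])", org.apache.spark.sql.types.DataTypes.StringType)

  // After the fix: without quotes the generated Java evaluates the reference, so
  // the schema field gets the intended name at runtime.
  val after: String = s""".add($keyName, org.apache.spark.sql.types.DataTypes.StringType)"""
  // => .add(((java.lang.String) references[1]), org.apache.spark.sql.types.DataTypes.StringType)
}
```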
--- .../execution/aggregate/RowBasedHashMapGenerator.scala | 8 ++++---- .../execution/aggregate/VectorizedHashMapGenerator.scala | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala index 9316ebcdf105c..3718424931b40 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala @@ -50,10 +50,10 @@ class RowBasedHashMapGenerator( val keyName = ctx.addReferenceMinorObj(key.name) key.dataType match { case d: DecimalType => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.createDecimalType( + s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( |${d.precision}, ${d.scale}))""".stripMargin case _ => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + s""".add($keyName, org.apache.spark.sql.types.DataTypes.${key.dataType})""" } }.mkString("\n").concat(";") @@ -63,10 +63,10 @@ class RowBasedHashMapGenerator( val keyName = ctx.addReferenceMinorObj(key.name) key.dataType match { case d: DecimalType => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.createDecimalType( + s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( |${d.precision}, ${d.scale}))""".stripMargin case _ => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + s""".add($keyName, org.apache.spark.sql.types.DataTypes.${key.dataType})""" } }.mkString("\n").concat(";") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 13f79275cac41..812d405d5ebfe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -55,10 +55,10 @@ class VectorizedHashMapGenerator( val keyName = ctx.addReferenceMinorObj(key.name) key.dataType match { case d: DecimalType => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.createDecimalType( + s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( |${d.precision}, ${d.scale}))""".stripMargin case _ => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + s""".add($keyName, org.apache.spark.sql.types.DataTypes.${key.dataType})""" } }.mkString("\n").concat(";") @@ -68,10 +68,10 @@ class VectorizedHashMapGenerator( val keyName = ctx.addReferenceMinorObj(key.name) key.dataType match { case d: DecimalType => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.createDecimalType( + s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( |${d.precision}, ${d.scale}))""".stripMargin case _ => - s""".add("$keyName", org.apache.spark.sql.types.DataTypes.${key.dataType})""" + s""".add($keyName, org.apache.spark.sql.types.DataTypes.${key.dataType})""" } }.mkString("\n").concat(";") From 014dc8471200518d63005eed531777d30d8a6639 Mon Sep 17 00:00:00 2001 From: liulijia Date: Sat, 14 Oct 2017 17:37:33 +0900 Subject: [PATCH 713/779] [SPARK-22233][CORE] Allow user to filter out empty split in HadoopRDD ## What changes were proposed in this pull 
request? Add a flag spark.files.ignoreEmptySplits. When true, methods like that use HadoopRDD and NewHadoopRDD such as SparkContext.textFiles will not create a partition for input splits that are empty. Author: liulijia Closes #19464 from liutang123/SPARK-22233. --- .../spark/internal/config/package.scala | 6 ++ .../org/apache/spark/rdd/HadoopRDD.scala | 12 ++- .../org/apache/spark/rdd/NewHadoopRDD.scala | 13 ++- .../scala/org/apache/spark/FileSuite.scala | 95 +++++++++++++++++-- 4 files changed, 112 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 19336f854145f..ce013d69579c1 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -270,6 +270,12 @@ package object config { .longConf .createWithDefault(4 * 1024 * 1024) + private[spark] val IGNORE_EMPTY_SPLITS = ConfigBuilder("spark.files.ignoreEmptySplits") + .doc("If true, methods that use HadoopRDD and NewHadoopRDD such as " + + "SparkContext.textFiles will not create a partition for input splits that are empty.") + .booleanConf + .createWithDefault(false) + private[spark] val SECRET_REDACTION_PATTERN = ConfigBuilder("spark.redaction.regex") .doc("Regex to decide which Spark configuration properties and environment variables in " + diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 23b344230e490..1f33c0a2b709f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -35,7 +35,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES +import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation} import org.apache.spark.storage.StorageLevel @@ -134,6 +134,8 @@ class HadoopRDD[K, V]( private val ignoreCorruptFiles = sparkContext.conf.get(IGNORE_CORRUPT_FILES) + private val ignoreEmptySplits = sparkContext.getConf.get(IGNORE_EMPTY_SPLITS) + // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads. 
protected def getJobConf(): JobConf = { val conf: Configuration = broadcastedConf.value.value @@ -195,8 +197,12 @@ class HadoopRDD[K, V]( val jobConf = getJobConf() // add the credentials here as this can be called before SparkContext initialized SparkHadoopUtil.get.addCredentials(jobConf) - val inputFormat = getInputFormat(jobConf) - val inputSplits = inputFormat.getSplits(jobConf, minPartitions) + val allInputSplits = getInputFormat(jobConf).getSplits(jobConf, minPartitions) + val inputSplits = if (ignoreEmptySplits) { + allInputSplits.filter(_.getLength > 0) + } else { + allInputSplits + } val array = new Array[Partition](inputSplits.size) for (i <- 0 until inputSplits.size) { array(i) = new HadoopPartition(id, i, inputSplits(i)) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 482875e6c1ac5..db4eac1d0a775 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -21,6 +21,7 @@ import java.io.IOException import java.text.SimpleDateFormat import java.util.{Date, Locale} +import scala.collection.JavaConverters.asScalaBufferConverter import scala.reflect.ClassTag import org.apache.hadoop.conf.{Configurable, Configuration} @@ -34,7 +35,7 @@ import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES +import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager} @@ -89,6 +90,8 @@ class NewHadoopRDD[K, V]( private val ignoreCorruptFiles = sparkContext.conf.get(IGNORE_CORRUPT_FILES) + private val ignoreEmptySplits = sparkContext.getConf.get(IGNORE_EMPTY_SPLITS) + def getConf: Configuration = { val conf: Configuration = confBroadcast.value.value if (shouldCloneJobConf) { @@ -121,8 +124,12 @@ class NewHadoopRDD[K, V]( configurable.setConf(_conf) case _ => } - val jobContext = new JobContextImpl(_conf, jobId) - val rawSplits = inputFormat.getSplits(jobContext).toArray + val allRowSplits = inputFormat.getSplits(new JobContextImpl(_conf, jobId)).asScala + val rawSplits = if (ignoreEmptySplits) { + allRowSplits.filter(_.getLength > 0) + } else { + allRowSplits + } val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 02728180ac82d..4da4323ceb5c8 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} -import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES +import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils @@ 
-347,10 +347,10 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { } } - test ("allow user to disable the output directory existence checking (old Hadoop API") { - val sf = new SparkConf() - sf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") - sc = new SparkContext(sf) + test ("allow user to disable the output directory existence checking (old Hadoop API)") { + val conf = new SparkConf() + conf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") + sc = new SparkContext(conf) val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) randomRDD.saveAsTextFile(tempDir.getPath + "/output") assert(new File(tempDir.getPath + "/output/part-00000").exists() === true) @@ -380,9 +380,9 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { } test ("allow user to disable the output directory existence checking (new Hadoop API") { - val sf = new SparkConf() - sf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") - sc = new SparkContext(sf) + val conf = new SparkConf() + conf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") + sc = new SparkContext(conf) val randomRDD = sc.parallelize( Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]]( @@ -510,4 +510,83 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { } } + test("spark.files.ignoreEmptySplits work correctly (old Hadoop API)") { + val conf = new SparkConf() + conf.setAppName("test").setMaster("local").set(IGNORE_EMPTY_SPLITS, true) + sc = new SparkContext(conf) + + def testIgnoreEmptySplits( + data: Array[Tuple2[String, String]], + actualPartitionNum: Int, + expectedPartitionNum: Int): Unit = { + val output = new File(tempDir, "output") + sc.parallelize(data, actualPartitionNum) + .saveAsHadoopFile[TextOutputFormat[String, String]](output.getPath) + for (i <- 0 until actualPartitionNum) { + assert(new File(output, s"part-0000$i").exists() === true) + } + val hadoopRDD = sc.textFile(new File(output, "part-*").getPath) + assert(hadoopRDD.partitions.length === expectedPartitionNum) + Utils.deleteRecursively(output) + } + + // Ensure that if all of the splits are empty, we remove the splits correctly + testIgnoreEmptySplits( + data = Array.empty[Tuple2[String, String]], + actualPartitionNum = 1, + expectedPartitionNum = 0) + + // Ensure that if no split is empty, we don't lose any splits + testIgnoreEmptySplits( + data = Array(("key1", "a"), ("key2", "a"), ("key3", "b")), + actualPartitionNum = 2, + expectedPartitionNum = 2) + + // Ensure that if part of the splits are empty, we remove the splits correctly + testIgnoreEmptySplits( + data = Array(("key1", "a"), ("key2", "a")), + actualPartitionNum = 5, + expectedPartitionNum = 2) + } + + test("spark.files.ignoreEmptySplits work correctly (new Hadoop API)") { + val conf = new SparkConf() + conf.setAppName("test").setMaster("local").set(IGNORE_EMPTY_SPLITS, true) + sc = new SparkContext(conf) + + def testIgnoreEmptySplits( + data: Array[Tuple2[String, String]], + actualPartitionNum: Int, + expectedPartitionNum: Int): Unit = { + val output = new File(tempDir, "output") + sc.parallelize(data, actualPartitionNum) + .saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](output.getPath) + for (i <- 0 until actualPartitionNum) { + assert(new File(output, s"part-r-0000$i").exists() === true) + } + val hadoopRDD = 
sc.newAPIHadoopFile(new File(output, "part-r-*").getPath, + classOf[NewTextInputFormat], classOf[LongWritable], classOf[Text]) + .asInstanceOf[NewHadoopRDD[_, _]] + assert(hadoopRDD.partitions.length === expectedPartitionNum) + Utils.deleteRecursively(output) + } + + // Ensure that if all of the splits are empty, we remove the splits correctly + testIgnoreEmptySplits( + data = Array.empty[Tuple2[String, String]], + actualPartitionNum = 1, + expectedPartitionNum = 0) + + // Ensure that if no split is empty, we don't lose any splits + testIgnoreEmptySplits( + data = Array(("1", "a"), ("2", "a"), ("3", "b")), + actualPartitionNum = 2, + expectedPartitionNum = 2) + + // Ensure that if part of the splits are empty, we remove the splits correctly + testIgnoreEmptySplits( + data = Array(("1", "a"), ("2", "b")), + actualPartitionNum = 5, + expectedPartitionNum = 2) + } } From e8547ffb49071525c06876c856cecc0d4731b918 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Sat, 14 Oct 2017 17:39:15 -0700 Subject: [PATCH 714/779] [SPARK-22238] Fix plan resolution bug caused by EnsureStatefulOpPartitioning ## What changes were proposed in this pull request? In EnsureStatefulOpPartitioning, we check that the inputRDD to a SparkPlan has the expected partitioning for Streaming Stateful Operators. The problem is that we are not allowed to access this information during planning. The reason we added that check was because CoalesceExec could actually create RDDs with 0 partitions. We should fix it such that when CoalesceExec says that there is a SinglePartition, there is in fact an inputRDD of 1 partition instead of 0 partitions. ## How was this patch tested? Regression test in StreamingQuerySuite Author: Burak Yavuz Closes #19467 from brkyvz/stateful-op. --- .../plans/physical/partitioning.scala | 15 +- .../execution/basicPhysicalOperators.scala | 27 +++- .../exchange/EnsureRequirements.scala | 5 +- .../FlatMapGroupsWithStateExec.scala | 2 +- .../streaming/IncrementalExecution.scala | 39 ++--- .../streaming/statefulOperators.scala | 11 +- .../org/apache/spark/sql/DataFrameSuite.scala | 2 + .../spark/sql/execution/PlannerSuite.scala | 17 +++ .../streaming/state/StateStoreRDDSuite.scala | 2 +- .../SymmetricHashJoinStateManagerSuite.scala | 2 +- .../sql/streaming/DeduplicateSuite.scala | 11 +- .../EnsureStatefulOpPartitioningSuite.scala | 138 ------------------ .../FlatMapGroupsWithStateSuite.scala | 6 +- .../sql/streaming/StatefulOperatorTest.scala | 49 +++++++ .../streaming/StreamingAggregationSuite.scala | 8 +- .../sql/streaming/StreamingJoinSuite.scala | 2 +- .../sql/streaming/StreamingQuerySuite.scala | 13 ++ 17 files changed, 160 insertions(+), 189 deletions(-) delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 51d78dd1233fe..e57c842ce2a36 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -49,7 +49,9 @@ case object AllTuples extends Distribution * can mean such tuples are either co-located in the same partition or they will be contiguous * within a single partition. 
*/ -case class ClusteredDistribution(clustering: Seq[Expression]) extends Distribution { +case class ClusteredDistribution( + clustering: Seq[Expression], + numPartitions: Option[Int] = None) extends Distribution { require( clustering != Nil, "The clustering expressions of a ClusteredDistribution should not be Nil. " + @@ -221,6 +223,7 @@ case object SinglePartition extends Partitioning { override def satisfies(required: Distribution): Boolean = required match { case _: BroadcastDistribution => false + case ClusteredDistribution(_, desiredPartitions) => desiredPartitions.forall(_ == 1) case _ => true } @@ -243,8 +246,9 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true - case ClusteredDistribution(requiredClustering) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) + case ClusteredDistribution(requiredClustering, desiredPartitions) => + expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && + desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true case _ => false } @@ -289,8 +293,9 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) case OrderedDistribution(requiredOrdering) => val minSize = Seq(requiredOrdering.size, ordering.size).min requiredOrdering.take(minSize) == ordering.take(minSize) - case ClusteredDistribution(requiredClustering) => - ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) + case ClusteredDistribution(requiredClustering, desiredPartitions) => + ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && + desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 63cd1691f4cd7..d15ece304cac4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.Duration -import org.apache.spark.{InterruptibleIterator, TaskContext} +import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.{EmptyRDD, PartitionwiseSampledRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -590,10 +590,33 @@ case class CoalesceExec(numPartitions: Int, child: SparkPlan) extends UnaryExecN } protected override def doExecute(): RDD[InternalRow] = { - child.execute().coalesce(numPartitions, shuffle = false) + if (numPartitions == 1 && child.execute().getNumPartitions < 1) { + // Make sure we don't output an RDD with 0 partitions, when claiming that we have a + // `SinglePartition`. + new CoalesceExec.EmptyRDDWithPartitions(sparkContext, numPartitions) + } else { + child.execute().coalesce(numPartitions, shuffle = false) + } } } +object CoalesceExec { + /** A simple RDD with no data, but with the given number of partitions. 
*/ + class EmptyRDDWithPartitions( + @transient private val sc: SparkContext, + numPartitions: Int) extends RDD[InternalRow](sc, Nil) { + + override def getPartitions: Array[Partition] = + Array.tabulate(numPartitions)(i => EmptyPartition(i)) + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + Iterator.empty + } + } + + case class EmptyPartition(index: Int) extends Partition +} + /** * Physical plan for a subquery. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index d28ce60e276d5..4e2ca37bc1a59 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -44,13 +44,16 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { /** * Given a required distribution, returns a partitioning that satisfies that distribution. + * @param requiredDistribution The distribution that is required by the operator + * @param numPartitions Used when the distribution doesn't require a specific number of partitions */ private def createPartitioning( requiredDistribution: Distribution, numPartitions: Int): Partitioning = { requiredDistribution match { case AllTuples => SinglePartition - case ClusteredDistribution(clustering) => HashPartitioning(clustering, numPartitions) + case ClusteredDistribution(clustering, desiredPartitions) => + HashPartitioning(clustering, desiredPartitions.getOrElse(numPartitions)) case OrderedDistribution(ordering) => RangePartitioning(ordering, numPartitions) case dist => sys.error(s"Do not know how to satisfy distribution $dist") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index aab06d611a5ea..c81f1a8142784 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -64,7 +64,7 @@ case class FlatMapGroupsWithStateExec( /** Distribute by grouping attributes */ override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(groupingAttributes) :: Nil + ClusteredDistribution(groupingAttributes, stateInfo.map(_.numPartitions)) :: Nil /** Ordering needed for using GroupingIterator */ override def requiredChildOrdering: Seq[Seq[SortOrder]] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 82f879c763c2b..2e378637727fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistrib import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode /** @@ -61,6 +62,10 @@ class 
IncrementalExecution( StreamingDeduplicationStrategy :: Nil } + private val numStateStores = offsetSeqMetadata.conf.get(SQLConf.SHUFFLE_PARTITIONS.key) + .map(SQLConf.SHUFFLE_PARTITIONS.valueConverter) + .getOrElse(sparkSession.sessionState.conf.numShufflePartitions) + /** * See [SPARK-18339] * Walk the optimized logical plan and replace CurrentBatchTimestamp @@ -83,7 +88,11 @@ class IncrementalExecution( /** Get the state info of the next stateful operator */ private def nextStatefulOperationStateInfo(): StatefulOperatorStateInfo = { StatefulOperatorStateInfo( - checkpointLocation, runId, statefulOperatorId.getAndIncrement(), currentBatchId) + checkpointLocation, + runId, + statefulOperatorId.getAndIncrement(), + currentBatchId, + numStateStores) } /** Locates save/restore pairs surrounding aggregation. */ @@ -130,34 +139,8 @@ class IncrementalExecution( } } - override def preparations: Seq[Rule[SparkPlan]] = - Seq(state, EnsureStatefulOpPartitioning) ++ super.preparations + override def preparations: Seq[Rule[SparkPlan]] = state +: super.preparations /** No need assert supported, as this check has already been done */ override def assertSupported(): Unit = { } } - -object EnsureStatefulOpPartitioning extends Rule[SparkPlan] { - // Needs to be transformUp to avoid extra shuffles - override def apply(plan: SparkPlan): SparkPlan = plan transformUp { - case so: StatefulOperator => - val numPartitions = plan.sqlContext.sessionState.conf.numShufflePartitions - val distributions = so.requiredChildDistribution - val children = so.children.zip(distributions).map { case (child, reqDistribution) => - val expectedPartitioning = reqDistribution match { - case AllTuples => SinglePartition - case ClusteredDistribution(keys) => HashPartitioning(keys, numPartitions) - case _ => throw new AnalysisException("Unexpected distribution expected for " + - s"Stateful Operator: $so. 
Expect AllTuples or ClusteredDistribution but got " + - s"$reqDistribution.") - } - if (child.outputPartitioning.guarantees(expectedPartitioning) && - child.execute().getNumPartitions == expectedPartitioning.numPartitions) { - child - } else { - ShuffleExchangeExec(expectedPartitioning, child) - } - } - so.withNewChildren(children) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index 0d85542928ee6..b9b07a2e688f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -43,10 +43,11 @@ case class StatefulOperatorStateInfo( checkpointLocation: String, queryRunId: UUID, operatorId: Long, - storeVersion: Long) { + storeVersion: Long, + numPartitions: Int) { override def toString(): String = { s"state info [ checkpoint = $checkpointLocation, runId = $queryRunId, " + - s"opId = $operatorId, ver = $storeVersion]" + s"opId = $operatorId, ver = $storeVersion, numPartitions = $numPartitions]" } } @@ -239,7 +240,7 @@ case class StateStoreRestoreExec( if (keyExpressions.isEmpty) { AllTuples :: Nil } else { - ClusteredDistribution(keyExpressions) :: Nil + ClusteredDistribution(keyExpressions, stateInfo.map(_.numPartitions)) :: Nil } } } @@ -386,7 +387,7 @@ case class StateStoreSaveExec( if (keyExpressions.isEmpty) { AllTuples :: Nil } else { - ClusteredDistribution(keyExpressions) :: Nil + ClusteredDistribution(keyExpressions, stateInfo.map(_.numPartitions)) :: Nil } } } @@ -401,7 +402,7 @@ case class StreamingDeduplicateExec( /** Distribute by grouping attributes */ override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(keyExpressions) :: Nil + ClusteredDistribution(keyExpressions, stateInfo.map(_.numPartitions)) :: Nil override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index ad461fa6144b3..50de2fd3bca8d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -368,6 +368,8 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer( testData.select('key).coalesce(1).select('key), testData.select('key).collect().toSeq) + + assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 1) } test("convert $\"attribute name\" into unresolved attribute") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 86066362da9dd..c25c90d0c70e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -425,6 +425,23 @@ class PlannerSuite extends SharedSQLContext { } } + test("EnsureRequirements should respect ClusteredDistribution's num partitioning") { + val distribution = ClusteredDistribution(Literal(1) :: Nil, Some(13)) + // Number of partitions differ + val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 13) + val childPartitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(!childPartitioning.satisfies(distribution)) + val inputPlan = DummySparkPlan( 
+ children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, + requiredChildDistribution = Seq(distribution), + requiredChildOrdering = Seq(Seq.empty)) + + val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) + val shuffle = outputPlan.collect { case e: ShuffleExchangeExec => e } + assert(shuffle.size === 1) + assert(shuffle.head.newPartitioning === finalPartitioning) + } + test("Reuse exchanges") { val distribution = ClusteredDistribution(Literal(1) :: Nil) val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala index defb9ed63a881..65b39f0fbd73d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala @@ -214,7 +214,7 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn path: String, queryRunId: UUID = UUID.randomUUID, version: Int = 0): StatefulOperatorStateInfo = { - StatefulOperatorStateInfo(path, queryRunId, operatorId = 0, version) + StatefulOperatorStateInfo(path, queryRunId, operatorId = 0, version, numPartitions = 5) } private val increment = (store: StateStore, iter: Iterator[String]) => { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala index d44af1d14c27a..c0216a2ef3e61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala @@ -160,7 +160,7 @@ class SymmetricHashJoinStateManagerSuite extends StreamTest with BeforeAndAfter withTempDir { file => val storeConf = new StateStoreConf() - val stateInfo = StatefulOperatorStateInfo(file.getAbsolutePath, UUID.randomUUID, 0, 0) + val stateInfo = StatefulOperatorStateInfo(file.getAbsolutePath, UUID.randomUUID, 0, 0, 5) val manager = new SymmetricHashJoinStateManager( LeftSide, inputValueAttribs, joinKeyExprs, Some(stateInfo), storeConf, new Configuration) try { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala index e858b7d9998a8..caf2bab8a5859 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.streaming import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning, SinglePartition} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ -import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingDeduplicateExec} import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.functions._ -class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { +class DeduplicateSuite extends StateStoreMetricsTest + with BeforeAndAfterAll + with 
StatefulOperatorTest { import testImplicits._ @@ -41,6 +44,8 @@ class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { AddData(inputData, "a"), CheckLastBatch("a"), assertNumStateRows(total = 1, updated = 1), + AssertOnQuery(sq => + checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("value"))), AddData(inputData, "a"), CheckLastBatch(), assertNumStateRows(total = 1, updated = 0), @@ -58,6 +63,8 @@ class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { AddData(inputData, "a" -> 1), CheckLastBatch("a" -> 1), assertNumStateRows(total = 1, updated = 1), + AssertOnQuery(sq => + checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("_1"))), AddData(inputData, "a" -> 2), // Dropped CheckLastBatch(), assertNumStateRows(total = 1, updated = 0), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala deleted file mode 100644 index ed9823fbddfda..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EnsureStatefulOpPartitioningSuite.scala +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.streaming - -import java.util.UUID - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.{Exchange, ShuffleExchangeExec} -import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata, StatefulOperator, StatefulOperatorStateInfo} -import org.apache.spark.sql.test.SharedSQLContext - -class EnsureStatefulOpPartitioningSuite extends SparkPlanTest with SharedSQLContext { - - import testImplicits._ - - private var baseDf: DataFrame = null - - override def beforeAll(): Unit = { - super.beforeAll() - baseDf = Seq((1, "A"), (2, "b")).toDF("num", "char") - } - - test("ClusteredDistribution generates Exchange with HashPartitioning") { - testEnsureStatefulOpPartitioning( - baseDf.queryExecution.sparkPlan, - requiredDistribution = keys => ClusteredDistribution(keys), - expectedPartitioning = - keys => HashPartitioning(keys, spark.sessionState.conf.numShufflePartitions), - expectShuffle = true) - } - - test("ClusteredDistribution with coalesce(1) generates Exchange with HashPartitioning") { - testEnsureStatefulOpPartitioning( - baseDf.coalesce(1).queryExecution.sparkPlan, - requiredDistribution = keys => ClusteredDistribution(keys), - expectedPartitioning = - keys => HashPartitioning(keys, spark.sessionState.conf.numShufflePartitions), - expectShuffle = true) - } - - test("AllTuples generates Exchange with SinglePartition") { - testEnsureStatefulOpPartitioning( - baseDf.queryExecution.sparkPlan, - requiredDistribution = _ => AllTuples, - expectedPartitioning = _ => SinglePartition, - expectShuffle = true) - } - - test("AllTuples with coalesce(1) doesn't need Exchange") { - testEnsureStatefulOpPartitioning( - baseDf.coalesce(1).queryExecution.sparkPlan, - requiredDistribution = _ => AllTuples, - expectedPartitioning = _ => SinglePartition, - expectShuffle = false) - } - - /** - * For `StatefulOperator` with the given `requiredChildDistribution`, and child SparkPlan - * `inputPlan`, ensures that the incremental planner adds exchanges, if required, in order to - * ensure the expected partitioning. - */ - private def testEnsureStatefulOpPartitioning( - inputPlan: SparkPlan, - requiredDistribution: Seq[Attribute] => Distribution, - expectedPartitioning: Seq[Attribute] => Partitioning, - expectShuffle: Boolean): Unit = { - val operator = TestStatefulOperator(inputPlan, requiredDistribution(inputPlan.output.take(1))) - val executed = executePlan(operator, OutputMode.Complete()) - if (expectShuffle) { - val exchange = executed.children.find(_.isInstanceOf[Exchange]) - if (exchange.isEmpty) { - fail(s"Was expecting an exchange but didn't get one in:\n$executed") - } - assert(exchange.get === - ShuffleExchangeExec(expectedPartitioning(inputPlan.output.take(1)), inputPlan), - s"Exchange didn't have expected properties:\n${exchange.get}") - } else { - assert(!executed.children.exists(_.isInstanceOf[Exchange]), - s"Unexpected exchange found in:\n$executed") - } - } - - /** Executes a SparkPlan using the IncrementalPlanner used for Structured Streaming. 
*/ - private def executePlan( - p: SparkPlan, - outputMode: OutputMode = OutputMode.Append()): SparkPlan = { - val execution = new IncrementalExecution( - spark, - null, - OutputMode.Complete(), - "chk", - UUID.randomUUID(), - 0L, - OffsetSeqMetadata()) { - override lazy val sparkPlan: SparkPlan = p transform { - case plan: SparkPlan => - val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap - plan transformExpressions { - case UnresolvedAttribute(Seq(u)) => - inputMap.getOrElse(u, - sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap")) - } - } - } - execution.executedPlan - } -} - -/** Used to emulate a `StatefulOperator` with the given requiredDistribution. */ -case class TestStatefulOperator( - child: SparkPlan, - requiredDist: Distribution) extends UnaryExecNode with StatefulOperator { - override def output: Seq[Attribute] = child.output - override def doExecute(): RDD[InternalRow] = child.execute() - override def requiredChildDistribution: Seq[Distribution] = requiredDist :: Nil - override def stateInfo: Option[StatefulOperatorStateInfo] = None -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index d2e8beb2f5290..aeb83835f981a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -41,7 +41,9 @@ case class RunningCount(count: Long) case class Result(key: Long, count: Int) -class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAfterAll { +class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest + with BeforeAndAfterAll + with StatefulOperatorTest { import testImplicits._ import GroupStateImpl._ @@ -544,6 +546,8 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest with BeforeAndAf AddData(inputData, "a"), CheckLastBatch(("a", "1")), assertNumStateRows(total = 1, updated = 1), + AssertOnQuery(sq => checkChildOutputHashPartitioning[FlatMapGroupsWithStateExec]( + sq, Seq("value"))), AddData(inputData, "a", "b"), CheckLastBatch(("a", "2"), ("b", "1")), assertNumStateRows(total = 2, updated = 2), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala new file mode 100644 index 0000000000000..45142278993bb --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.streaming._ + +trait StatefulOperatorTest { + /** + * Check that the output partitioning of a child operator of a Stateful operator satisfies the + * distribution that we expect for our Stateful operator. + */ + protected def checkChildOutputHashPartitioning[T <: StatefulOperator]( + sq: StreamingQuery, + colNames: Seq[String]): Boolean = { + val attr = sq.asInstanceOf[StreamExecution].lastExecution.analyzed.output + val partitions = sq.sparkSession.sessionState.conf.numShufflePartitions + val groupingAttr = attr.filter(a => colNames.contains(a.name)) + checkChildOutputPartitioning(sq, HashPartitioning(groupingAttr, partitions)) + } + + /** + * Check that the output partitioning of a child operator of a Stateful operator satisfies the + * distribution that we expect for our Stateful operator. + */ + protected def checkChildOutputPartitioning[T <: StatefulOperator]( + sq: StreamingQuery, + expectedPartitioning: Partitioning): Boolean = { + val operator = sq.asInstanceOf[StreamExecution].lastExecution + .executedPlan.collect { case p: T => p } + operator.head.children.forall( + _.outputPartitioning.numPartitions == expectedPartitioning.numPartitions) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index fe7efa69f7e31..1b4d8556f6ae5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -44,7 +44,7 @@ object FailureSingleton { } class StreamingAggregationSuite extends StateStoreMetricsTest - with BeforeAndAfterAll with Assertions { + with BeforeAndAfterAll with Assertions with StatefulOperatorTest { override def afterAll(): Unit = { super.afterAll() @@ -281,6 +281,8 @@ class StreamingAggregationSuite extends StateStoreMetricsTest AddData(inputData, 0L, 5L, 5L, 10L), AdvanceManualClock(10 * 1000), CheckLastBatch((0L, 1), (5L, 2), (10L, 1)), + AssertOnQuery(sq => + checkChildOutputHashPartitioning[StateStoreRestoreExec](sq, Seq("value"))), // advance clock to 20 seconds, should retain keys >= 10 AddData(inputData, 15L, 15L, 20L), @@ -455,8 +457,8 @@ class StreamingAggregationSuite extends StateStoreMetricsTest }, AddBlockData(inputSource), // create an empty trigger CheckLastBatch(1), - AssertOnQuery("Verify addition of exchange operator") { se => - checkAggregationChain(se, expectShuffling = true, 1) + AssertOnQuery("Verify that no exchange is required") { se => + checkAggregationChain(se, expectShuffling = false, 1) }, AddBlockData(inputSource, Seq(2, 3)), CheckLastBatch(3), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index a6593b71e51de..d32617275aadc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -330,7 +330,7 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with val queryId = UUID.randomUUID val opId = 0 val path = Utils.createDirectory(tempDir.getAbsolutePath, Random.nextString(10)).toString - val stateInfo = StatefulOperatorStateInfo(path, queryId, opId, 0L) + val stateInfo 
= StatefulOperatorStateInfo(path, queryId, opId, 0L, 5) implicit val sqlContext = spark.sqlContext val coordinatorRef = sqlContext.streams.stateStoreCoordinator diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index ab35079dca23f..c53889bb8566c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -652,6 +652,19 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi } } + test("SPARK-22238: don't check for RDD partitions during streaming aggregation preparation") { + val stream = MemoryStream[(Int, Int)] + val baseDf = Seq((1, "A"), (2, "b")).toDF("num", "char").where("char = 'A'") + val otherDf = stream.toDF().toDF("num", "numSq") + .join(broadcast(baseDf), "num") + .groupBy('char) + .agg(sum('numSq)) + + testStream(otherDf, OutputMode.Complete())( + AddData(stream, (1, 1), (2, 4)), + CheckLastBatch(("A", 1))) + } + /** Create a streaming DF that only execute one batch in which it returns the given static DF */ private def createSingleTriggerStreamingDF(triggerDF: DataFrame): DataFrame = { require(!triggerDF.isStreaming) From 13c1559587d0eb533c94f5a492390f81b048b347 Mon Sep 17 00:00:00 2001 From: Mridul Muralidharan Date: Sun, 15 Oct 2017 18:40:53 -0700 Subject: [PATCH 715/779] [SPARK-21549][CORE] Respect OutputFormats with no/invalid output directory provided ## What changes were proposed in this pull request? PR #19294 added support for null's - but spark 2.1 handled other error cases where path argument can be invalid. Namely: * empty string * URI parse exception while creating Path This is resubmission of PR #19487, which I messed up while updating my repo. ## How was this patch tested? Enhanced test to cover new support added. Author: Mridul Muralidharan Closes #19497 from mridulm/master. --- .../io/HadoopMapReduceCommitProtocol.scala | 24 +++++++------- .../spark/rdd/PairRDDFunctionsSuite.scala | 31 +++++++++++++------ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index a7e6859ef6b64..95c99d29c3a9c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -20,6 +20,7 @@ package org.apache.spark.internal.io import java.util.{Date, UUID} import scala.collection.mutable +import scala.util.Try import org.apache.hadoop.conf.Configurable import org.apache.hadoop.fs.Path @@ -47,6 +48,16 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ + /** + * Checks whether there are files to be committed to a valid output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether a valid output path is specified. + * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for + * committers not writing to distributed file systems. 
+ */ + private val hasValidPath = Try { new Path(path) }.isSuccess + /** * Tracks files staged by this task for absolute output paths. These outputs are not managed by * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. @@ -60,15 +71,6 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) */ private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) - /** - * Checks whether there are files to be committed to an absolute output location. - * - * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, - * it is necessary to check whether the output path is specified. Output path may not be required - * for committers not writing to distributed file systems. - */ - private def hasAbsPathFiles: Boolean = path != null - protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.newInstance() // If OutputFormat is Configurable, we should set conf to it. @@ -142,7 +144,7 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) .foldLeft(Map[String, String]())(_ ++ _) logDebug(s"Committing files staged for absolute locations $filesToMove") - if (hasAbsPathFiles) { + if (hasValidPath) { val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) for ((src, dst) <- filesToMove) { fs.rename(new Path(src), new Path(dst)) @@ -153,7 +155,7 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) - if (hasAbsPathFiles) { + if (hasValidPath) { val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) fs.delete(absPathStagingDir, true) } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 07579c5098014..0a248b6064ee8 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -568,21 +568,34 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assert(FakeWriterWithCallback.exception.getMessage contains "failed to write") } - test("saveAsNewAPIHadoopDataset should respect empty output directory when " + + test("saveAsNewAPIHadoopDataset should support invalid output paths when " + "there are no files to be committed to an absolute output location") { val pairs = sc.parallelize(Array((new Integer(1), new Integer(2))), 1) - val job = NewJob.getInstance(new Configuration(sc.hadoopConfiguration)) - job.setOutputKeyClass(classOf[Integer]) - job.setOutputValueClass(classOf[Integer]) - job.setOutputFormatClass(classOf[NewFakeFormat]) - val jobConfiguration = job.getConfiguration + def saveRddWithPath(path: String): Unit = { + val job = NewJob.getInstance(new Configuration(sc.hadoopConfiguration)) + job.setOutputKeyClass(classOf[Integer]) + job.setOutputValueClass(classOf[Integer]) + job.setOutputFormatClass(classOf[NewFakeFormat]) + if (null != path) { + job.getConfiguration.set("mapred.output.dir", path) + } else { + job.getConfiguration.unset("mapred.output.dir") + } + val jobConfiguration = job.getConfiguration + + // just test that the job does not fail with java.lang.IllegalArgumentException. 
+ pairs.saveAsNewAPIHadoopDataset(jobConfiguration) + } - // just test that the job does not fail with - // java.lang.IllegalArgumentException: Can not create a Path from a null string - pairs.saveAsNewAPIHadoopDataset(jobConfiguration) + saveRddWithPath(null) + saveRddWithPath("") + saveRddWithPath("::invalid::") } + // In spark 2.1, only null was supported - not other invalid paths. + // org.apache.hadoop.mapred.FileOutputFormat.getOutputPath fails with IllegalArgumentException + // for non-null invalid paths. test("saveAsHadoopDataset should respect empty output directory when " + "there are no files to be committed to an absolute output location") { val pairs = sc.parallelize(Array((new Integer(1), new Integer(2))), 1) From 0ae96495dedb54b3b6bae0bd55560820c5ca29a2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 16 Oct 2017 13:37:58 +0800 Subject: [PATCH 716/779] [SPARK-22223][SQL] ObjectHashAggregate should not introduce unnecessary shuffle ## What changes were proposed in this pull request? `ObjectHashAggregateExec` should override `outputPartitioning` in order to avoid unnecessary shuffle. ## How was this patch tested? Added Jenkins test. Author: Liang-Chi Hsieh Closes #19501 from viirya/SPARK-22223. --- .../aggregate/ObjectHashAggregateExec.scala | 2 ++ .../spark/sql/DataFrameAggregateSuite.scala | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala index ec3f9a05b5ccc..66955b8ef723c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala @@ -95,6 +95,8 @@ case class ObjectHashAggregateExec( } } + override def outputPartitioning: Partitioning = child.outputPartitioning + protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { val numOutputRows = longMetric("numOutputRows") val aggTime = longMetric("aggTime") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 8549eac58ee95..06848e4d2b297 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -21,6 +21,7 @@ import scala.util.Random import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -636,4 +637,33 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { spark.sql("SELECT 3 AS c, 4 AS d, SUM(b) FROM testData2 GROUP BY c, d"), Seq(Row(3, 4, 9))) } + + test("SPARK-22223: ObjectHashAggregate should not introduce unnecessary shuffle") { + withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { + val df = Seq(("1", "2", 1), ("1", "2", 2), ("2", "3", 3), ("2", "3", 4)).toDF("a", "b", "c") + .repartition(col("a")) + + val objHashAggDF = df + .withColumn("d", expr("(a, b, c)")) + .groupBy("a", "b").agg(collect_list("d").as("e")) + .withColumn("f", expr("(b, e)")) + 
.groupBy("a").agg(collect_list("f").as("g")) + val aggPlan = objHashAggDF.queryExecution.executedPlan + + val sortAggPlans = aggPlan.collect { + case sortAgg: SortAggregateExec => sortAgg + } + assert(sortAggPlans.isEmpty) + + val objHashAggPlans = aggPlan.collect { + case objHashAgg: ObjectHashAggregateExec => objHashAgg + } + assert(objHashAggPlans.nonEmpty) + + val exchangePlans = aggPlan.collect { + case shuffle: ShuffleExchangeExec => shuffle + } + assert(exchangePlans.length == 1) + } + } } From 0fa10666cf75e3c4929940af49c8a6f6ea874759 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Mon, 16 Oct 2017 22:15:50 +0800 Subject: [PATCH 717/779] [SPARK-22233][CORE][FOLLOW-UP] Allow user to filter out empty split in HadoopRDD ## What changes were proposed in this pull request? Update the config `spark.files.ignoreEmptySplits`, rename it and make it internal. This is followup of #19464 ## How was this patch tested? Exsiting tests. Author: Xingbo Jiang Closes #19504 from jiangxb1987/partitionsplit. --- .../org/apache/spark/internal/config/package.scala | 11 ++++++----- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 4 ++-- .../scala/org/apache/spark/rdd/NewHadoopRDD.scala | 4 ++-- .../test/scala/org/apache/spark/FileSuite.scala | 14 +++++++++----- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index ce013d69579c1..efffdca1ea59b 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -270,11 +270,12 @@ package object config { .longConf .createWithDefault(4 * 1024 * 1024) - private[spark] val IGNORE_EMPTY_SPLITS = ConfigBuilder("spark.files.ignoreEmptySplits") - .doc("If true, methods that use HadoopRDD and NewHadoopRDD such as " + - "SparkContext.textFiles will not create a partition for input splits that are empty.") - .booleanConf - .createWithDefault(false) + private[spark] val HADOOP_RDD_IGNORE_EMPTY_SPLITS = + ConfigBuilder("spark.hadoopRDD.ignoreEmptySplits") + .internal() + .doc("When true, HadoopRDD/NewHadoopRDD will not create partitions for empty input splits.") + .booleanConf + .createWithDefault(false) private[spark] val SECRET_REDACTION_PATTERN = ConfigBuilder("spark.redaction.regex") diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 1f33c0a2b709f..2480559a41b7a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -35,7 +35,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} +import org.apache.spark.internal.config._ import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation} import org.apache.spark.storage.StorageLevel @@ -134,7 +134,7 @@ class HadoopRDD[K, V]( private val ignoreCorruptFiles = sparkContext.conf.get(IGNORE_CORRUPT_FILES) - private val ignoreEmptySplits = sparkContext.getConf.get(IGNORE_EMPTY_SPLITS) + private val ignoreEmptySplits = sparkContext.conf.get(HADOOP_RDD_IGNORE_EMPTY_SPLITS) // Returns a JobConf that will be used on slaves to obtain input 
splits for Hadoop reads. protected def getJobConf(): JobConf = { diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index db4eac1d0a775..e4dd1b6a82498 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -35,7 +35,7 @@ import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} +import org.apache.spark.internal.config._ import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager} @@ -90,7 +90,7 @@ class NewHadoopRDD[K, V]( private val ignoreCorruptFiles = sparkContext.conf.get(IGNORE_CORRUPT_FILES) - private val ignoreEmptySplits = sparkContext.getConf.get(IGNORE_EMPTY_SPLITS) + private val ignoreEmptySplits = sparkContext.conf.get(HADOOP_RDD_IGNORE_EMPTY_SPLITS) def getConf: Configuration = { val conf: Configuration = confBroadcast.value.value diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 4da4323ceb5c8..e9539dc73f6fa 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} -import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_EMPTY_SPLITS} +import org.apache.spark.internal.config._ import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils @@ -510,9 +510,11 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { } } - test("spark.files.ignoreEmptySplits work correctly (old Hadoop API)") { + test("spark.hadoopRDD.ignoreEmptySplits work correctly (old Hadoop API)") { val conf = new SparkConf() - conf.setAppName("test").setMaster("local").set(IGNORE_EMPTY_SPLITS, true) + .setAppName("test") + .setMaster("local") + .set(HADOOP_RDD_IGNORE_EMPTY_SPLITS, true) sc = new SparkContext(conf) def testIgnoreEmptySplits( @@ -549,9 +551,11 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { expectedPartitionNum = 2) } - test("spark.files.ignoreEmptySplits work correctly (new Hadoop API)") { + test("spark.hadoopRDD.ignoreEmptySplits work correctly (new Hadoop API)") { val conf = new SparkConf() - conf.setAppName("test").setMaster("local").set(IGNORE_EMPTY_SPLITS, true) + .setAppName("test") + .setMaster("local") + .set(HADOOP_RDD_IGNORE_EMPTY_SPLITS, true) sc = new SparkContext(conf) def testIgnoreEmptySplits( From 561505e2fc290fc2cee3b8464ec49df773dca5eb Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 16 Oct 2017 11:27:08 -0700 Subject: [PATCH 718/779] [SPARK-22282][SQL] Rename OrcRelation to OrcFileFormat and remove ORC_COMPRESSION ## What changes were proposed in this pull request? This PR aims to - Rename `OrcRelation` to `OrcFileFormat` object. - Replace `OrcRelation.ORC_COMPRESSION` with `org.apache.orc.OrcConf.COMPRESS`. 
Since [SPARK-21422](https://issues.apache.org/jira/browse/SPARK-21422), we can use `OrcConf.COMPRESS` instead of Hive's. ```scala // The references of Hive's classes will be minimized. val ORC_COMPRESSION = "orc.compress" ``` ## How was this patch tested? Pass the Jenkins with the existing and updated test cases. Author: Dongjoon Hyun Closes #19502 from dongjoon-hyun/SPARK-22282. --- .../org/apache/spark/sql/DataFrameWriter.scala | 4 ++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 18 ++++++++---------- .../apache/spark/sql/hive/orc/OrcOptions.scala | 8 +++++--- .../spark/sql/hive/orc/OrcQuerySuite.scala | 11 ++++++----- .../spark/sql/hive/orc/OrcSourceSuite.scala | 9 ++++++--- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 07347d2748544..c9e45436ed42f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -520,8 +520,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { *
<li>`compression` (default is the value specified in `spark.sql.orc.compression.codec`): * compression codec to use when saving to file. This can be one of the known case-insensitive * shorten names(`none`, `snappy`, `zlib`, and `lzo`). This will override - * `orc.compress` and `spark.sql.parquet.compression.codec`. If `orc.compress` is given, - * it overrides `spark.sql.parquet.compression.codec`.</li>
  • + * `orc.compress` and `spark.sql.orc.compression.codec`. If `orc.compress` is given, + * it overrides `spark.sql.orc.compression.codec`. * * * @since 1.5.0 diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 194e69c93e1a8..d26ec15410d95 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.io.{NullWritable, Writable} import org.apache.hadoop.mapred.{JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} +import org.apache.orc.OrcConf.COMPRESS import org.apache.spark.TaskContext import org.apache.spark.sql.SparkSession @@ -72,7 +73,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val configuration = job.getConfiguration - configuration.set(OrcRelation.ORC_COMPRESSION, orcOptions.compressionCodec) + configuration.set(COMPRESS.getAttribute, orcOptions.compressionCodec) configuration match { case conf: JobConf => conf.setOutputFormat(classOf[OrcOutputFormat]) @@ -93,8 +94,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable override def getFileExtension(context: TaskAttemptContext): String = { val compressionExtension: String = { - val name = context.getConfiguration.get(OrcRelation.ORC_COMPRESSION) - OrcRelation.extensionsForCompressionCodecNames.getOrElse(name, "") + val name = context.getConfiguration.get(COMPRESS.getAttribute) + OrcFileFormat.extensionsForCompressionCodecNames.getOrElse(name, "") } compressionExtension + ".orc" @@ -120,7 +121,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable if (sparkSession.sessionState.conf.orcFilterPushDown) { // Sets pushed predicates OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f => - hadoopConf.set(OrcRelation.SARG_PUSHDOWN, f.toKryo) + hadoopConf.set(OrcFileFormat.SARG_PUSHDOWN, f.toKryo) hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true) } } @@ -138,7 +139,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable if (isEmptyFile) { Iterator.empty } else { - OrcRelation.setRequiredColumns(conf, dataSchema, requiredSchema) + OrcFileFormat.setRequiredColumns(conf, dataSchema, requiredSchema) val orcRecordReader = { val job = Job.getInstance(conf) @@ -160,7 +161,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => recordsIterator.close())) // Unwraps `OrcStruct`s to `UnsafeRow`s - OrcRelation.unwrapOrcStructs( + OrcFileFormat.unwrapOrcStructs( conf, dataSchema, requiredSchema, @@ -255,10 +256,7 @@ private[orc] class OrcOutputWriter( } } -private[orc] object OrcRelation extends HiveInspectors { - // The references of Hive's classes will be minimized. - val ORC_COMPRESSION = "orc.compress" - +private[orc] object OrcFileFormat extends HiveInspectors { // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is unfortunately not public. 
private[orc] val SARG_PUSHDOWN = "sarg.pushdown" diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala index 7f94c8c579026..6ce90c07b4921 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.orc import java.util.Locale +import org.apache.orc.OrcConf.COMPRESS + import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.internal.SQLConf @@ -40,9 +42,9 @@ private[orc] class OrcOptions( * Acceptable values are defined in [[shortOrcCompressionCodecNames]]. */ val compressionCodec: String = { - // `compression`, `orc.compress`, and `spark.sql.orc.compression.codec` are - // in order of precedence from highest to lowest. - val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION) + // `compression`, `orc.compress`(i.e., OrcConf.COMPRESS), and `spark.sql.orc.compression.codec` + // are in order of precedence from highest to lowest. + val orcCompressionConf = parameters.get(COMPRESS.getAttribute) val codecName = parameters .get("compression") .orElse(orcCompressionConf) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 60ccd996d6d58..1fa9091f967a3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -22,6 +22,7 @@ import java.sql.Timestamp import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.ql.io.orc.{OrcStruct, SparkOrcNewRecordReader} +import org.apache.orc.OrcConf.COMPRESS import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ @@ -176,11 +177,11 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("SPARK-16610: Respect orc.compress option when compression is unset") { - // Respect `orc.compress`. + test("SPARK-16610: Respect orc.compress (i.e., OrcConf.COMPRESS) when compression is unset") { + // Respect `orc.compress` (i.e., OrcConf.COMPRESS). 
withTempPath { file => spark.range(0, 10).write - .option("orc.compress", "ZLIB") + .option(COMPRESS.getAttribute, "ZLIB") .orc(file.getCanonicalPath) val expectedCompressionKind = OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression @@ -191,7 +192,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withTempPath { file => spark.range(0, 10).write .option("compression", "ZLIB") - .option("orc.compress", "SNAPPY") + .option(COMPRESS.getAttribute, "SNAPPY") .orc(file.getCanonicalPath) val expectedCompressionKind = OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression @@ -598,7 +599,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { val requestedSchema = StructType(Nil) val conf = new Configuration() val physicalSchema = OrcFileOperator.readSchema(Seq(path), Some(conf)).get - OrcRelation.setRequiredColumns(conf, physicalSchema, requestedSchema) + OrcFileFormat.setRequiredColumns(conf, physicalSchema, requestedSchema) val maybeOrcReader = OrcFileOperator.getFileReader(path, Some(conf)) assert(maybeOrcReader.isDefined) val orcRecordReader = new SparkOrcNewRecordReader( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 781de6631f324..ef9e67c743837 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.hive.orc import java.io.File +import java.util.Locale +import org.apache.orc.OrcConf.COMPRESS import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.{QueryTest, Row} @@ -150,7 +152,8 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA test("SPARK-18433: Improve DataSource option keys to be more case-insensitive") { val conf = sqlContext.sessionState.conf - assert(new OrcOptions(Map("Orc.Compress" -> "NONE"), conf).compressionCodec == "NONE") + val option = new OrcOptions(Map(COMPRESS.getAttribute.toUpperCase(Locale.ROOT) -> "NONE"), conf) + assert(option.compressionCodec == "NONE") } test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { @@ -205,8 +208,8 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA // `compression` -> `orc.compression` -> `spark.sql.orc.compression.codec` withSQLConf(SQLConf.ORC_COMPRESSION.key -> "uncompressed") { assert(new OrcOptions(Map.empty[String, String], conf).compressionCodec == "NONE") - val map1 = Map("orc.compress" -> "zlib") - val map2 = Map("orc.compress" -> "zlib", "compression" -> "lzo") + val map1 = Map(COMPRESS.getAttribute -> "zlib") + val map2 = Map(COMPRESS.getAttribute -> "zlib", "compression" -> "lzo") assert(new OrcOptions(map1, conf).compressionCodec == "ZLIB") assert(new OrcOptions(map2, conf).compressionCodec == "LZO") } From c09a2a76b52905a784d2767cb899dc886c330628 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 16 Oct 2017 16:16:34 -0700 Subject: [PATCH 719/779] [SPARK-22280][SQL][TEST] Improve StatisticsSuite to test `convertMetastore` properly ## What changes were proposed in this pull request? This PR aims to improve **StatisticsSuite** to test `convertMetastore` configuration properly. Currently, some test logic in `test statistics of LogicalRelation converted from Hive serde tables` depends on the default configuration. 
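In outline, the reworked test runs the same checks once per (format, convertMetastore) combination instead of relying on whatever the defaults happen to be. A self-contained sketch of that iteration pattern follows; the println stands in for the suite's `withSQLConf`, `withTable` and `checkTableStats` helpers, which are assumed from the test mixins and not reproduced here:

```scala
object ConvertMetastoreMatrixSketch {
  def main(args: Array[String]): Unit = {
    // Cross product of Hive serde format and the convertMetastore flag, so both the
    // converted (LogicalRelation) and non-converted (Hive serde) paths are exercised.
    for (format <- Seq("orc", "parquet"); isConverted <- Seq(true, false)) {
      // In StatisticsSuite this block would set HiveUtils.CONVERT_METASTORE_ORC/PARQUET,
      // create a table STORED AS <format>, INSERT from src, ANALYZE, and check the stats.
      println(s"format=$format convertMetastore=$isConverted")
    }
  }
}
```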
New test case is shorter and covers both(true/false) cases explicitly. This test case was previously modified by SPARK-17410 and SPARK-17284 in Spark 2.3.0. - https://github.com/apache/spark/commit/a2460be9c30b67b9159fe339d115b84d53cc288a#diff-1c464c86b68c2d0b07e73b7354e74ce7R443 ## How was this patch tested? Pass the Jenkins with the improved test case. Author: Dongjoon Hyun Closes #19500 from dongjoon-hyun/SPARK-22280. --- .../spark/sql/hive/StatisticsSuite.scala | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 9ff9ecf7f3677..b9a5ad7657134 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -937,26 +937,20 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } test("test statistics of LogicalRelation converted from Hive serde tables") { - val parquetTable = "parquetTable" - val orcTable = "orcTable" - withTable(parquetTable, orcTable) { - sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) STORED AS PARQUET") - sql(s"CREATE TABLE $orcTable (key STRING, value STRING) STORED AS ORC") - sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") - sql(s"INSERT INTO TABLE $orcTable SELECT * FROM src") - - // the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it - // for robustness - withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true") { - checkTableStats(parquetTable, hasSizeInBytes = false, expectedRowCounts = None) - sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS") - checkTableStats(parquetTable, hasSizeInBytes = true, expectedRowCounts = Some(500)) - } - withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { - // We still can get tableSize from Hive before Analyze - checkTableStats(orcTable, hasSizeInBytes = true, expectedRowCounts = None) - sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS") - checkTableStats(orcTable, hasSizeInBytes = true, expectedRowCounts = Some(500)) + Seq("orc", "parquet").foreach { format => + Seq(true, false).foreach { isConverted => + withSQLConf( + HiveUtils.CONVERT_METASTORE_ORC.key -> s"$isConverted", + HiveUtils.CONVERT_METASTORE_PARQUET.key -> s"$isConverted") { + withTable(format) { + sql(s"CREATE TABLE $format (key STRING, value STRING) STORED AS $format") + sql(s"INSERT INTO TABLE $format SELECT * FROM src") + + checkTableStats(format, hasSizeInBytes = !isConverted, expectedRowCounts = None) + sql(s"ANALYZE TABLE $format COMPUTE STATISTICS") + checkTableStats(format, hasSizeInBytes = true, expectedRowCounts = Some(500)) + } + } } } } From e66cabb0215204605ca7928406d4787d41853dd1 Mon Sep 17 00:00:00 2001 From: Ben Barnard Date: Tue, 17 Oct 2017 09:36:09 +0200 Subject: [PATCH 720/779] [SPARK-20992][SCHEDULER] Add links in documentation to Nomad integration. ## What changes were proposed in this pull request? Adds links to the fork that provides integration with Nomad, in the same places the k8s integration is linked to. ## How was this patch tested? I clicked on the links to make sure they're correct ;) Author: Ben Barnard Closes #19354 from barnardb/link-to-nomad-integration. 
--- docs/cluster-overview.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index a2ad958959a50..c42bb4bb8377e 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -58,6 +58,9 @@ for providing container-centric infrastructure. Kubernetes support is being acti developed in an [apache-spark-on-k8s](https://github.com/apache-spark-on-k8s/) Github organization. For documentation, refer to that project's README. +A third-party project (not supported by the Spark project) exists to add support for +[Nomad](https://github.com/hashicorp/nomad-spark) as a cluster manager. + # Submitting Applications Applications can be submitted to a cluster of any type using the `spark-submit` script. From 8148f19ca1f0e0375603cb4f180c1bad8b0b8042 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 17 Oct 2017 09:41:23 +0200 Subject: [PATCH 721/779] [SPARK-22249][SQL] isin with empty list throws exception on cached DataFrame ## What changes were proposed in this pull request? As pointed out in the JIRA, there is a bug which causes an exception to be thrown if `isin` is called with an empty list on a cached DataFrame. The PR fixes it. ## How was this patch tested? Added UT. Author: Marco Gaido Closes #19494 from mgaido91/SPARK-22249. --- .../columnar/InMemoryTableScanExec.scala | 1 + .../columnar/InMemoryColumnarQuerySuite.scala | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index af3636a5a2ca7..846ec03e46a12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -102,6 +102,7 @@ case class InMemoryTableScanExec( case IsNull(a: Attribute) => statsFor(a).nullCount > 0 case IsNotNull(a: Attribute) => statsFor(a).count - statsFor(a).nullCount > 0 + case In(_: AttributeReference, list: Seq[Expression]) if list.isEmpty => Literal.FalseLiteral case In(a: AttributeReference, list: Seq[Expression]) if list.forall(_.isInstanceOf[Literal]) => list.map(l => statsFor(a).lowerBound <= l.asInstanceOf[Literal] && l.asInstanceOf[Literal] <= statsFor(a).upperBound).reduce(_ || _) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 8d411eb191cd9..75d17bc79477d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -429,4 +429,19 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(agg_without_cache, agg_with_cache) } } + + test("SPARK-22249: IN should work also with cached DataFrame") { + val df = spark.range(10).cache() + // with an empty list + assert(df.filter($"id".isin()).count() == 0) + // with a non-empty list + assert(df.filter($"id".isin(2)).count() == 1) + assert(df.filter($"id".isin(2, 3)).count() == 2) + df.unpersist() + val dfNulls = spark.range(10).selectExpr("null as id").cache() + // with null as value for the attribute + assert(dfNulls.filter($"id".isin()).count() == 0) + assert(dfNulls.filter($"id".isin(2, 3)).count() == 0) + 
dfNulls.unpersist() + } } From 99e32f8ba5d908d5408e9857fd96ac1d7d7e5876 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Tue, 17 Oct 2017 17:58:45 +0800 Subject: [PATCH 722/779] [SPARK-22224][SQL] Override toString of KeyValue/Relational-GroupedDataset ## What changes were proposed in this pull request? #### before ```scala scala> val words = spark.read.textFile("README.md").flatMap(_.split(" ")) words: org.apache.spark.sql.Dataset[String] = [value: string] scala> val grouped = words.groupByKey(identity) grouped: org.apache.spark.sql.KeyValueGroupedDataset[String,String] = org.apache.spark.sql.KeyValueGroupedDataset65214862 ``` #### after ```scala scala> val words = spark.read.textFile("README.md").flatMap(_.split(" ")) words: org.apache.spark.sql.Dataset[String] = [value: string] scala> val grouped = words.groupByKey(identity) grouped: org.apache.spark.sql.KeyValueGroupedDataset[String,String] = [key: [value: string], value: [value: string]] ``` ## How was this patch tested? existing ut cc gatorsmile cloud-fan Author: Kent Yao Closes #19363 from yaooqinn/minor-dataset-tostring. --- .../spark/sql/KeyValueGroupedDataset.scala | 22 ++++++- .../spark/sql/RelationalGroupedDataset.scala | 19 +++++- .../org/apache/spark/sql/DatasetSuite.scala | 61 +++++++++++++++++++ .../org/apache/spark/sql/QueryTest.scala | 12 +--- 4 files changed, 100 insertions(+), 14 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index cb42e9e4560cf..6bab21dca0cbd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -24,7 +24,6 @@ import org.apache.spark.api.java.function._ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CreateStruct} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.expressions.ReduceAggregator import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode} @@ -564,4 +563,25 @@ class KeyValueGroupedDataset[K, V] private[sql]( encoder: Encoder[R]): Dataset[R] = { cogroup(other)((key, left, right) => f.call(key, left.asJava, right.asJava).asScala)(encoder) } + + override def toString: String = { + val builder = new StringBuilder + val kFields = kExprEnc.schema.map { + case f => s"${f.name}: ${f.dataType.simpleString(2)}" + } + val vFields = vExprEnc.schema.map { + case f => s"${f.name}: ${f.dataType.simpleString(2)}" + } + builder.append("KeyValueGroupedDataset: [key: [") + builder.append(kFields.take(2).mkString(", ")) + if (kFields.length > 2) { + builder.append(" ... " + (kFields.length - 2) + " more field(s)") + } + builder.append("], value: [") + builder.append(vFields.take(2).mkString(", ")) + if (vFields.length > 2) { + builder.append(" ... 
" + (vFields.length - 2) + " more field(s)") + } + builder.append("]]").toString() + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index cd0ac1feffa51..33ec3a27110a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.execution.python.PythonUDF import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{NumericType, StructField, StructType} +import org.apache.spark.sql.types.{NumericType, StructType} /** * A set of methods for aggregations on a `DataFrame`, created by [[Dataset#groupBy groupBy]], @@ -465,6 +465,19 @@ class RelationalGroupedDataset protected[sql]( Dataset.ofRows(df.sparkSession, plan) } + + override def toString: String = { + val builder = new StringBuilder + builder.append("RelationalGroupedDataset: [grouping expressions: [") + val kFields = groupingExprs.map(_.asInstanceOf[NamedExpression]).map { + case f => s"${f.name}: ${f.dataType.simpleString(2)}" + } + builder.append(kFields.take(2).mkString(", ")) + if (kFields.length > 2) { + builder.append(" ... " + (kFields.length - 2) + " more field(s)") + } + builder.append(s"], value: ${df.toString}, type: $groupType]").toString() + } } private[sql] object RelationalGroupedDataset { @@ -479,7 +492,9 @@ private[sql] object RelationalGroupedDataset { /** * The Grouping Type */ - private[sql] trait GroupType + private[sql] trait GroupType { + override def toString: String = getClass.getSimpleName.stripSuffix("$").stripSuffix("Type") + } /** * To indicate it's the GroupBy diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index dace6825ee40e..1537ce3313c09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1341,8 +1341,69 @@ class DatasetSuite extends QueryTest with SharedSQLContext { Seq(1).toDS().map(_ => ("", TestForTypeAlias.seqOfTupleTypeAlias)), ("", Seq((1, 1), (2, 2)))) } + + test("Check RelationalGroupedDataset toString: Single data") { + val kvDataset = (1 to 3).toDF("id").groupBy("id") + val expected = "RelationalGroupedDataset: [" + + "grouping expressions: [id: int], value: [id: int], type: GroupBy]" + val actual = kvDataset.toString + assert(expected === actual) + } + + test("Check RelationalGroupedDataset toString: over length schema ") { + val kvDataset = (1 to 3).map( x => (x, x.toString, x.toLong)) + .toDF("id", "val1", "val2").groupBy("id") + val expected = "RelationalGroupedDataset:" + + " [grouping expressions: [id: int]," + + " value: [id: int, val1: string ... 
1 more field]," + + " type: GroupBy]" + val actual = kvDataset.toString + assert(expected === actual) + } + + + test("Check KeyValueGroupedDataset toString: Single data") { + val kvDataset = (1 to 3).toDF("id").as[SingleData].groupByKey(identity) + val expected = "KeyValueGroupedDataset: [key: [id: int], value: [id: int]]" + val actual = kvDataset.toString + assert(expected === actual) + } + + test("Check KeyValueGroupedDataset toString: Unnamed KV-pair") { + val kvDataset = (1 to 3).map(x => (x, x.toString)) + .toDF("id", "val1").as[DoubleData].groupByKey(x => (x.id, x.val1)) + val expected = "KeyValueGroupedDataset:" + + " [key: [_1: int, _2: string]," + + " value: [id: int, val1: string]]" + val actual = kvDataset.toString + assert(expected === actual) + } + + test("Check KeyValueGroupedDataset toString: Named KV-pair") { + val kvDataset = (1 to 3).map( x => (x, x.toString)) + .toDF("id", "val1").as[DoubleData].groupByKey(x => DoubleData(x.id, x.val1)) + val expected = "KeyValueGroupedDataset:" + + " [key: [id: int, val1: string]," + + " value: [id: int, val1: string]]" + val actual = kvDataset.toString + assert(expected === actual) + } + + test("Check KeyValueGroupedDataset toString: over length schema ") { + val kvDataset = (1 to 3).map( x => (x, x.toString, x.toLong)) + .toDF("id", "val1", "val2").as[TripleData].groupByKey(identity) + val expected = "KeyValueGroupedDataset:" + + " [key: [id: int, val1: string ... 1 more field(s)]," + + " value: [id: int, val1: string ... 1 more field(s)]]" + val actual = kvDataset.toString + assert(expected === actual) + } } +case class SingleData(id: Int) +case class DoubleData(id: Int, val1: String) +case class TripleData(id: Int, val1: String, val2: Long) + case class WithImmutableMap(id: String, map_test: scala.collection.immutable.Map[Long, String]) case class WithMap(id: String, map_test: scala.collection.Map[Long, String]) case class WithMapInOption(m: Option[scala.collection.Map[Int, Int]]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index f9808834df4a5..fcaca3d75b74f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -17,23 +17,13 @@ package org.apache.spark.sql -import java.util.{ArrayDeque, Locale, TimeZone} +import java.util.{Locale, TimeZone} import scala.collection.JavaConverters._ -import scala.util.control.NonFatal -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.ImperativeAggregate import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.execution.columnar.InMemoryRelation -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.streaming.MemoryPlan -import org.apache.spark.sql.types.{Metadata, ObjectType} abstract class QueryTest extends PlanTest { From e1960c3d6f380b0dfbba6ee5d8ac6da4bc29a698 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Tue, 17 Oct 2017 22:54:38 +0800 Subject: [PATCH 723/779] [SPARK-22062][CORE] Spill large block to disk in BlockManager's remote fetch to avoid OOM ## What changes were proposed in this pull request? 
In the current BlockManager's `getRemoteBytes`, it will call `BlockTransferService#fetchBlockSync` to get remote block. In the `fetchBlockSync`, Spark will allocate a temporary `ByteBuffer` to store the whole fetched block. This will potentially lead to OOM if block size is too big or several blocks are fetched simultaneously in this executor. So here leveraging the idea of shuffle fetch, to spill the large block to local disk before consumed by upstream code. The behavior is controlled by newly added configuration, if block size is smaller than the threshold, then this block will be persisted in memory; otherwise it will first spill to disk, and then read from disk file. To achieve this feature, what I did is: 1. Rename `TempShuffleFileManager` to `TempFileManager`, since now it is not only used by shuffle. 2. Add a new `TempFileManager` to manage the files of fetched remote blocks, the files are tracked by weak reference, will be deleted when no use at all. ## How was this patch tested? This was tested by adding UT, also manual verification in local test to perform GC to clean the files. Author: jerryshao Closes #19476 from jerryshao/SPARK-22062. --- .../shuffle/ExternalShuffleClient.java | 4 +- .../shuffle/OneForOneBlockFetcher.java | 12 +-- .../spark/network/shuffle/ShuffleClient.java | 10 +- ...eFileManager.java => TempFileManager.java} | 12 +-- .../scala/org/apache/spark/SparkConf.scala | 4 +- .../spark/internal/config/package.scala | 15 +-- .../spark/network/BlockTransferService.scala | 28 +++-- .../netty/NettyBlockTransferService.scala | 6 +- .../shuffle/BlockStoreShuffleReader.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 102 ++++++++++++++++-- .../spark/storage/BlockManagerMaster.scala | 6 ++ .../storage/BlockManagerMasterEndpoint.scala | 14 +++ .../spark/storage/BlockManagerMessages.scala | 7 ++ .../storage/ShuffleBlockFetcherIterator.scala | 8 +- .../org/apache/spark/DistributedSuite.scala | 2 +- .../spark/storage/BlockManagerSuite.scala | 57 +++++++--- .../ShuffleBlockFetcherIteratorSuite.scala | 10 +- docs/configuration.md | 11 +- 18 files changed, 236 insertions(+), 74 deletions(-) rename common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/{TempShuffleFileManager.java => TempFileManager.java} (74%) diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java index 77702447edb88..510017fee2db5 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java @@ -91,7 +91,7 @@ public void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - TempShuffleFileManager tempShuffleFileManager) { + TempFileManager tempFileManager) { checkInit(); logger.debug("External shuffle fetch from {}:{} (executor id {})", host, port, execId); try { @@ -99,7 +99,7 @@ public void fetchBlocks( (blockIds1, listener1) -> { TransportClient client = clientFactory.createClient(host, port); new OneForOneBlockFetcher(client, appId, execId, - blockIds1, listener1, conf, tempShuffleFileManager).start(); + blockIds1, listener1, conf, tempFileManager).start(); }; int maxRetries = conf.maxIORetries(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java 
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index 66b67e282c80d..3f2f20b4149f1 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -58,7 +58,7 @@ public class OneForOneBlockFetcher { private final BlockFetchingListener listener; private final ChunkReceivedCallback chunkCallback; private final TransportConf transportConf; - private final TempShuffleFileManager tempShuffleFileManager; + private final TempFileManager tempFileManager; private StreamHandle streamHandle = null; @@ -79,14 +79,14 @@ public OneForOneBlockFetcher( String[] blockIds, BlockFetchingListener listener, TransportConf transportConf, - TempShuffleFileManager tempShuffleFileManager) { + TempFileManager tempFileManager) { this.client = client; this.openMessage = new OpenBlocks(appId, execId, blockIds); this.blockIds = blockIds; this.listener = listener; this.chunkCallback = new ChunkCallback(); this.transportConf = transportConf; - this.tempShuffleFileManager = tempShuffleFileManager; + this.tempFileManager = tempFileManager; } /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. */ @@ -125,7 +125,7 @@ public void onSuccess(ByteBuffer response) { // Immediately request all chunks -- we expect that the total size of the request is // reasonable due to higher level chunking in [[ShuffleBlockFetcherIterator]]. for (int i = 0; i < streamHandle.numChunks; i++) { - if (tempShuffleFileManager != null) { + if (tempFileManager != null) { client.stream(OneForOneStreamManager.genStreamChunkId(streamHandle.streamId, i), new DownloadCallback(i)); } else { @@ -164,7 +164,7 @@ private class DownloadCallback implements StreamCallback { private int chunkIndex; DownloadCallback(int chunkIndex) throws IOException { - this.targetFile = tempShuffleFileManager.createTempShuffleFile(); + this.targetFile = tempFileManager.createTempFile(); this.channel = Channels.newChannel(Files.newOutputStream(targetFile.toPath())); this.chunkIndex = chunkIndex; } @@ -180,7 +180,7 @@ public void onComplete(String streamId) throws IOException { ManagedBuffer buffer = new FileSegmentManagedBuffer(transportConf, targetFile, 0, targetFile.length()); listener.onBlockFetchSuccess(blockIds[chunkIndex], buffer); - if (!tempShuffleFileManager.registerTempShuffleFileToClean(targetFile)) { + if (!tempFileManager.registerTempFileToClean(targetFile)) { targetFile.delete(); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java index 5bd4412b75275..18b04fedcac5b 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java @@ -43,10 +43,10 @@ public void init(String appId) { } * @param execId the executor id. * @param blockIds block ids to fetch. * @param listener the listener to receive block fetching status. - * @param tempShuffleFileManager TempShuffleFileManager to create and clean temp shuffle files. - * If it's not null, the remote blocks will be streamed - * into temp shuffle files to reduce the memory usage, otherwise, - * they will be kept in memory. + * @param tempFileManager TempFileManager to create and clean temp files. 
+ * If it's not null, the remote blocks will be streamed + * into temp shuffle files to reduce the memory usage, otherwise, + * they will be kept in memory. */ public abstract void fetchBlocks( String host, @@ -54,7 +54,7 @@ public abstract void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - TempShuffleFileManager tempShuffleFileManager); + TempFileManager tempFileManager); /** * Get the shuffle MetricsSet from ShuffleClient, this will be used in MetricsSystem to diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java similarity index 74% rename from common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java rename to common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java index 84a5ed6a276bd..552364d274f19 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempShuffleFileManager.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java @@ -20,17 +20,17 @@ import java.io.File; /** - * A manager to create temp shuffle block files to reduce the memory usage and also clean temp + * A manager to create temp block files to reduce the memory usage and also clean temp * files when they won't be used any more. */ -public interface TempShuffleFileManager { +public interface TempFileManager { - /** Create a temp shuffle block file. */ - File createTempShuffleFile(); + /** Create a temp block file. */ + File createTempFile(); /** - * Register a temp shuffle file to clean up when it won't be used any more. Return whether the + * Register a temp file to clean up when it won't be used any more. Return whether the * file is registered successfully. If `false`, the caller should clean up the file by itself. */ - boolean registerTempShuffleFileToClean(File file); + boolean registerTempFileToClean(File file); } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index e61f943af49f2..57b3744e9c30a 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -662,7 +662,9 @@ private[spark] object SparkConf extends Logging { "spark.yarn.jars" -> Seq( AlternateConfig("spark.yarn.jar", "2.0")), "spark.yarn.access.hadoopFileSystems" -> Seq( - AlternateConfig("spark.yarn.access.namenodes", "2.2")) + AlternateConfig("spark.yarn.access.namenodes", "2.2")), + "spark.maxRemoteBlockSizeFetchToMem" -> Seq( + AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3")) ) /** diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index efffdca1ea59b..e7b406af8d9b1 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -357,13 +357,15 @@ package object config { .checkValue(_ > 0, "The max no. 
of blocks in flight cannot be non-positive.") .createWithDefault(Int.MaxValue) - private[spark] val REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM = - ConfigBuilder("spark.reducer.maxReqSizeShuffleToMem") - .doc("The blocks of a shuffle request will be fetched to disk when size of the request is " + + private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM = + ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem") + .doc("Remote block will be fetched to disk when size of the block is " + "above this threshold. This is to avoid a giant request takes too much memory. We can " + - "enable this config by setting a specific value(e.g. 200m). Note that this config can " + - "be enabled only when the shuffle shuffle service is newer than Spark-2.2 or the shuffle" + - " service is disabled.") + "enable this config by setting a specific value(e.g. 200m). Note this configuration will " + + "affect both shuffle fetch and block manager remote block fetch. For users who " + + "enabled external shuffle service, this feature can only be worked when external shuffle" + + " service is newer than Spark 2.2.") + .withAlternative("spark.reducer.maxReqSizeShuffleToMem") .bytesConf(ByteUnit.BYTE) .createWithDefault(Long.MaxValue) @@ -432,5 +434,4 @@ package object config { .stringConf .toSequence .createOptional - } diff --git a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala index fe5fd2da039bb..1d8a266d0079c 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala @@ -25,8 +25,8 @@ import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging -import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} +import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils @@ -68,7 +68,7 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempShuffleFileManager: TempShuffleFileManager): Unit + tempFileManager: TempFileManager): Unit /** * Upload a single block to a remote node, available only after [[init]] is invoked. @@ -87,7 +87,12 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo * * It is also only available after [[init]] is invoked. */ - def fetchBlockSync(host: String, port: Int, execId: String, blockId: String): ManagedBuffer = { + def fetchBlockSync( + host: String, + port: Int, + execId: String, + blockId: String, + tempFileManager: TempFileManager): ManagedBuffer = { // A monitor for the thread to wait on. 
val result = Promise[ManagedBuffer]() fetchBlocks(host, port, execId, Array(blockId), @@ -96,12 +101,17 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo result.failure(exception) } override def onBlockFetchSuccess(blockId: String, data: ManagedBuffer): Unit = { - val ret = ByteBuffer.allocate(data.size.toInt) - ret.put(data.nioByteBuffer()) - ret.flip() - result.success(new NioManagedBuffer(ret)) + data match { + case f: FileSegmentManagedBuffer => + result.success(f) + case _ => + val ret = ByteBuffer.allocate(data.size.toInt) + ret.put(data.nioByteBuffer()) + ret.flip() + result.success(new NioManagedBuffer(ret)) + } } - }, tempShuffleFileManager = null) + }, tempFileManager) ThreadUtils.awaitResult(result.future, Duration.Inf) } diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index 6a29e18bf3cbb..b7d8c35032763 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -32,7 +32,7 @@ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap, TransportClientFactory} import org.apache.spark.network.crypto.{AuthClientBootstrap, AuthServerBootstrap} import org.apache.spark.network.server._ -import org.apache.spark.network.shuffle.{BlockFetchingListener, OneForOneBlockFetcher, RetryingBlockFetcher, TempShuffleFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, OneForOneBlockFetcher, RetryingBlockFetcher, TempFileManager} import org.apache.spark.network.shuffle.protocol.UploadBlock import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.JavaSerializer @@ -105,14 +105,14 @@ private[spark] class NettyBlockTransferService( execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempShuffleFileManager: TempShuffleFileManager): Unit = { + tempFileManager: TempFileManager): Unit = { logTrace(s"Fetch blocks from $host:$port (executor id $execId)") try { val blockFetchStarter = new RetryingBlockFetcher.BlockFetchStarter { override def createAndStart(blockIds: Array[String], listener: BlockFetchingListener) { val client = clientFactory.createClient(host, port) new OneForOneBlockFetcher(client, appId, execId, blockIds, listener, - transportConf, tempShuffleFileManager).start() + transportConf, tempFileManager).start() } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala index c8d1460300934..0562d45ff57c5 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala @@ -52,7 +52,7 @@ private[spark] class BlockStoreShuffleReader[K, C]( SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024, SparkEnv.get.conf.getInt("spark.reducer.maxReqsInFlight", Int.MaxValue), SparkEnv.get.conf.get(config.REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS), - SparkEnv.get.conf.get(config.REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM), + SparkEnv.get.conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM), SparkEnv.get.conf.getBoolean("spark.shuffle.detectCorrupt", true)) val serializerInstance = dep.serializer.newInstance() diff --git 
a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index a98083df5bd84..e0276a4dc4224 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -18,8 +18,11 @@ package org.apache.spark.storage import java.io._ +import java.lang.ref.{ReferenceQueue => JReferenceQueue, WeakReference} import java.nio.ByteBuffer import java.nio.channels.Channels +import java.util.Collections +import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable import scala.collection.mutable.HashMap @@ -39,7 +42,7 @@ import org.apache.spark.metrics.source.Source import org.apache.spark.network._ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.netty.SparkTransportConf -import org.apache.spark.network.shuffle.ExternalShuffleClient +import org.apache.spark.network.shuffle.{ExternalShuffleClient, TempFileManager} import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo import org.apache.spark.rpc.RpcEnv import org.apache.spark.serializer.{SerializerInstance, SerializerManager} @@ -203,6 +206,13 @@ private[spark] class BlockManager( private var blockReplicationPolicy: BlockReplicationPolicy = _ + // A TempFileManager used to track all the files of remote blocks which above the + // specified memory threshold. Files will be deleted automatically based on weak reference. + // Exposed for test + private[storage] val remoteBlockTempFileManager = + new BlockManager.RemoteBlockTempFileManager(this) + private val maxRemoteBlockToMem = conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM) + /** * Initializes the BlockManager with the given appId. This is not performed in the constructor as * the appId may not be known at BlockManager instantiation time (in particular for the driver, @@ -632,8 +642,8 @@ private[spark] class BlockManager( * Return a list of locations for the given block, prioritizing the local machine since * multiple block managers can share the same host, followed by hosts on the same rack. */ - private def getLocations(blockId: BlockId): Seq[BlockManagerId] = { - val locs = Random.shuffle(master.getLocations(blockId)) + private def sortLocations(locations: Seq[BlockManagerId]): Seq[BlockManagerId] = { + val locs = Random.shuffle(locations) val (preferredLocs, otherLocs) = locs.partition { loc => blockManagerId.host == loc.host } blockManagerId.topologyInfo match { case None => preferredLocs ++ otherLocs @@ -653,7 +663,25 @@ private[spark] class BlockManager( require(blockId != null, "BlockId is null") var runningFailureCount = 0 var totalFailureCount = 0 - val locations = getLocations(blockId) + + // Because all the remote blocks are registered in driver, it is not necessary to ask + // all the slave executors to get block status. + val locationsAndStatus = master.getLocationsAndStatus(blockId) + val blockSize = locationsAndStatus.map { b => + b.status.diskSize.max(b.status.memSize) + }.getOrElse(0L) + val blockLocations = locationsAndStatus.map(_.locations).getOrElse(Seq.empty) + + // If the block size is above the threshold, we should pass our FileManger to + // BlockTransferService, which will leverage it to spill the block; if not, then passed-in + // null value means the block will be persisted in memory. 
+ val tempFileManager = if (blockSize > maxRemoteBlockToMem) { + remoteBlockTempFileManager + } else { + null + } + + val locations = sortLocations(blockLocations) val maxFetchFailures = locations.size var locationIterator = locations.iterator while (locationIterator.hasNext) { @@ -661,7 +689,7 @@ private[spark] class BlockManager( logDebug(s"Getting remote block $blockId from $loc") val data = try { blockTransferService.fetchBlockSync( - loc.host, loc.port, loc.executorId, blockId.toString).nioByteBuffer() + loc.host, loc.port, loc.executorId, blockId.toString, tempFileManager).nioByteBuffer() } catch { case NonFatal(e) => runningFailureCount += 1 @@ -684,7 +712,7 @@ private[spark] class BlockManager( // take a significant amount of time. To get rid of these stale entries // we refresh the block locations after a certain number of fetch failures if (runningFailureCount >= maxFailuresBeforeLocationRefresh) { - locationIterator = getLocations(blockId).iterator + locationIterator = sortLocations(master.getLocations(blockId)).iterator logDebug(s"Refreshed locations from the driver " + s"after ${runningFailureCount} fetch failures.") runningFailureCount = 0 @@ -1512,6 +1540,7 @@ private[spark] class BlockManager( // Closing should be idempotent, but maybe not for the NioBlockTransferService. shuffleClient.close() } + remoteBlockTempFileManager.stop() diskBlockManager.stop() rpcEnv.stop(slaveEndpoint) blockInfoManager.clear() @@ -1552,4 +1581,65 @@ private[spark] object BlockManager { override val metricRegistry = new MetricRegistry metricRegistry.registerAll(metricSet) } + + class RemoteBlockTempFileManager(blockManager: BlockManager) + extends TempFileManager with Logging { + + private class ReferenceWithCleanup(file: File, referenceQueue: JReferenceQueue[File]) + extends WeakReference[File](file, referenceQueue) { + private val filePath = file.getAbsolutePath + + def cleanUp(): Unit = { + logDebug(s"Clean up file $filePath") + + if (!new File(filePath).delete()) { + logDebug(s"Fail to delete file $filePath") + } + } + } + + private val referenceQueue = new JReferenceQueue[File] + private val referenceBuffer = Collections.newSetFromMap[ReferenceWithCleanup]( + new ConcurrentHashMap) + + private val POLL_TIMEOUT = 1000 + @volatile private var stopped = false + + private val cleaningThread = new Thread() { override def run() { keepCleaning() } } + cleaningThread.setDaemon(true) + cleaningThread.setName("RemoteBlock-temp-file-clean-thread") + cleaningThread.start() + + override def createTempFile(): File = { + blockManager.diskBlockManager.createTempLocalBlock()._2 + } + + override def registerTempFileToClean(file: File): Boolean = { + referenceBuffer.add(new ReferenceWithCleanup(file, referenceQueue)) + } + + def stop(): Unit = { + stopped = true + cleaningThread.interrupt() + cleaningThread.join() + } + + private def keepCleaning(): Unit = { + while (!stopped) { + try { + Option(referenceQueue.remove(POLL_TIMEOUT)) + .map(_.asInstanceOf[ReferenceWithCleanup]) + .foreach { ref => + referenceBuffer.remove(ref) + ref.cleanUp() + } + } catch { + case _: InterruptedException => + // no-op + case NonFatal(e) => + logError("Error in cleaning thread", e) + } + } + } + } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index 8b1dc0ba6356a..d24421b962774 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ 
b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -84,6 +84,12 @@ class BlockManagerMaster( driverEndpoint.askSync[Seq[BlockManagerId]](GetLocations(blockId)) } + /** Get locations as well as status of the blockId from the driver */ + def getLocationsAndStatus(blockId: BlockId): Option[BlockLocationsAndStatus] = { + driverEndpoint.askSync[Option[BlockLocationsAndStatus]]( + GetLocationsAndStatus(blockId)) + } + /** Get locations of multiple blockIds from the driver */ def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { driverEndpoint.askSync[IndexedSeq[Seq[BlockManagerId]]]( diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index df0a5f5e229fb..56d0266b8edad 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -82,6 +82,9 @@ class BlockManagerMasterEndpoint( case GetLocations(blockId) => context.reply(getLocations(blockId)) + case GetLocationsAndStatus(blockId) => + context.reply(getLocationsAndStatus(blockId)) + case GetLocationsMultipleBlockIds(blockIds) => context.reply(getLocationsMultipleBlockIds(blockIds)) @@ -422,6 +425,17 @@ class BlockManagerMasterEndpoint( if (blockLocations.containsKey(blockId)) blockLocations.get(blockId).toSeq else Seq.empty } + private def getLocationsAndStatus(blockId: BlockId): Option[BlockLocationsAndStatus] = { + val locations = Option(blockLocations.get(blockId)).map(_.toSeq).getOrElse(Seq.empty) + val status = locations.headOption.flatMap { bmId => blockManagerInfo(bmId).getStatus(blockId) } + + if (locations.nonEmpty && status.isDefined) { + Some(BlockLocationsAndStatus(locations, status.get)) + } else { + None + } + } + private def getLocationsMultipleBlockIds( blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { blockIds.map(blockId => getLocations(blockId)) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index 0c0ff144596ac..1bbe7a5b39509 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -93,6 +93,13 @@ private[spark] object BlockManagerMessages { case class GetLocations(blockId: BlockId) extends ToBlockManagerMaster + case class GetLocationsAndStatus(blockId: BlockId) extends ToBlockManagerMaster + + // The response message of `GetLocationsAndStatus` request. 
+ case class BlockLocationsAndStatus(locations: Seq[BlockManagerId], status: BlockStatus) { + assert(locations.nonEmpty) + } + case class GetLocationsMultipleBlockIds(blockIds: Array[BlockId]) extends ToBlockManagerMaster case class GetPeers(blockManagerId: BlockManagerId) extends ToBlockManagerMaster diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 2d176b62f8b36..98b5a735a4529 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -28,7 +28,7 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Queue} import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils import org.apache.spark.util.io.ChunkedByteBufferOutputStream @@ -69,7 +69,7 @@ final class ShuffleBlockFetcherIterator( maxBlocksInFlightPerAddress: Int, maxReqSizeShuffleToMem: Long, detectCorrupt: Boolean) - extends Iterator[(BlockId, InputStream)] with TempShuffleFileManager with Logging { + extends Iterator[(BlockId, InputStream)] with TempFileManager with Logging { import ShuffleBlockFetcherIterator._ @@ -162,11 +162,11 @@ final class ShuffleBlockFetcherIterator( currentResult = null } - override def createTempShuffleFile(): File = { + override def createTempFile(): File = { blockManager.diskBlockManager.createTempLocalBlock()._2 } - override def registerTempShuffleFileToClean(file: File): Boolean = synchronized { + override def registerTempFileToClean(file: File): Boolean = synchronized { if (isZombie) { false } else { diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index bea67b71a5a12..f8005610f7e4f 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -171,7 +171,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex val serializerManager = SparkEnv.get.serializerManager blockManager.master.getLocations(blockId).foreach { cmId => val bytes = blockTransfer.fetchBlockSync(cmId.host, cmId.port, cmId.executorId, - blockId.toString) + blockId.toString, null) val deserialized = serializerManager.dataDeserializeStream(blockId, new ChunkedByteBuffer(bytes.nioByteBuffer()).toInputStream())(data.elementClassTag).toList assert(deserialized === (1 to 100).toList) diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index cfe89fde63f88..d45c194d31adc 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.storage -import java.io.File import java.nio.ByteBuffer import scala.collection.JavaConverters._ @@ -45,14 +44,14 @@ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import 
org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.netty.{NettyBlockTransferService, SparkTransportConf} import org.apache.spark.network.server.{NoOpRpcHandler, TransportServer, TransportServerBootstrap} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempShuffleFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, RegisterExecutor} import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.security.{CryptoStreamUtils, EncryptionFunSuite} import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, SerializerManager} import org.apache.spark.shuffle.sort.SortShuffleManager -import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat +import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util._ import org.apache.spark.util.io.ChunkedByteBuffer @@ -512,8 +511,8 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE when(bmMaster.getLocations(mc.any[BlockId])).thenReturn(Seq(bmId1, bmId2, bmId3)) val blockManager = makeBlockManager(128, "exec", bmMaster) - val getLocations = PrivateMethod[Seq[BlockManagerId]]('getLocations) - val locations = blockManager invokePrivate getLocations(BroadcastBlockId(0)) + val sortLocations = PrivateMethod[Seq[BlockManagerId]]('sortLocations) + val locations = blockManager invokePrivate sortLocations(bmMaster.getLocations("test")) assert(locations.map(_.host) === Seq(localHost, localHost, otherHost)) } @@ -535,8 +534,8 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val blockManager = makeBlockManager(128, "exec", bmMaster) blockManager.blockManagerId = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, localHost, 1, Some(localRack)) - val getLocations = PrivateMethod[Seq[BlockManagerId]]('getLocations) - val locations = blockManager invokePrivate getLocations(BroadcastBlockId(0)) + val sortLocations = PrivateMethod[Seq[BlockManagerId]]('sortLocations) + val locations = blockManager invokePrivate sortLocations(bmMaster.getLocations("test")) assert(locations.map(_.host) === Seq(localHost, localHost, otherHost, otherHost, otherHost)) assert(locations.flatMap(_.topologyInfo) === Seq(localRack, localRack, localRack, otherRack, otherRack)) @@ -1274,13 +1273,18 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE // so that we have a chance to do location refresh val blockManagerIds = (0 to maxFailuresBeforeLocationRefresh) .map { i => BlockManagerId(s"id-$i", s"host-$i", i + 1) } - when(mockBlockManagerMaster.getLocations(mc.any[BlockId])).thenReturn(blockManagerIds) + when(mockBlockManagerMaster.getLocationsAndStatus(mc.any[BlockId])).thenReturn( + Option(BlockLocationsAndStatus(blockManagerIds, BlockStatus.empty))) + when(mockBlockManagerMaster.getLocations(mc.any[BlockId])).thenReturn( + blockManagerIds) + store = makeBlockManager(8000, "executor1", mockBlockManagerMaster, transferService = Option(mockBlockTransferService)) val block = store.getRemoteBytes("item") .asInstanceOf[Option[ByteBuffer]] assert(block.isDefined) - verify(mockBlockManagerMaster, times(2)).getLocations("item") + verify(mockBlockManagerMaster, times(1)).getLocationsAndStatus("item") + verify(mockBlockManagerMaster, times(1)).getLocations("item") } test("SPARK-17484: block status is properly updated 
following an exception in put()") { @@ -1371,8 +1375,32 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE server.close() } + test("fetch remote block to local disk if block size is larger than threshold") { + conf.set(MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM, 1000L) + + val mockBlockManagerMaster = mock(classOf[BlockManagerMaster]) + val mockBlockTransferService = new MockBlockTransferService(0) + val blockLocations = Seq(BlockManagerId("id-0", "host-0", 1)) + val blockStatus = BlockStatus(StorageLevel.DISK_ONLY, 0L, 2000L) + + when(mockBlockManagerMaster.getLocationsAndStatus(mc.any[BlockId])).thenReturn( + Option(BlockLocationsAndStatus(blockLocations, blockStatus))) + when(mockBlockManagerMaster.getLocations(mc.any[BlockId])).thenReturn(blockLocations) + + store = makeBlockManager(8000, "executor1", mockBlockManagerMaster, + transferService = Option(mockBlockTransferService)) + val block = store.getRemoteBytes("item") + .asInstanceOf[Option[ByteBuffer]] + + assert(block.isDefined) + assert(mockBlockTransferService.numCalls === 1) + // assert FileManager is not null if the block size is larger than threshold. + assert(mockBlockTransferService.tempFileManager === store.remoteBlockTempFileManager) + } + class MockBlockTransferService(val maxFailures: Int) extends BlockTransferService { var numCalls = 0 + var tempFileManager: TempFileManager = null override def init(blockDataManager: BlockDataManager): Unit = {} @@ -1382,7 +1410,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempShuffleFileManager: TempShuffleFileManager): Unit = { + tempFileManager: TempFileManager): Unit = { listener.onBlockFetchSuccess("mockBlockId", new NioManagedBuffer(ByteBuffer.allocate(1))) } @@ -1394,7 +1422,8 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE override def uploadBlock( hostname: String, - port: Int, execId: String, + port: Int, + execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, @@ -1407,12 +1436,14 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE host: String, port: Int, execId: String, - blockId: String): ManagedBuffer = { + blockId: String, + tempFileManager: TempFileManager): ManagedBuffer = { numCalls += 1 + this.tempFileManager = tempFileManager if (numCalls <= maxFailures) { throw new RuntimeException("Failing block fetch in the mock block transfer service") } - super.fetchBlockSync(host, port, execId, blockId) + super.fetchBlockSync(host, port, execId, blockId, tempFileManager) } } } diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index c371cbcf8dff5..5bfe9905ff17b 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -33,7 +33,7 @@ import org.scalatest.PrivateMethodTester import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.network._ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, TempShuffleFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, TempFileManager} import org.apache.spark.network.util.LimitedInputStream import 
org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils @@ -437,12 +437,12 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val remoteBlocks = Map[BlockId, ManagedBuffer]( ShuffleBlockId(0, 0, 0) -> createMockManagedBuffer()) val transfer = mock(classOf[BlockTransferService]) - var tempShuffleFileManager: TempShuffleFileManager = null + var tempFileManager: TempFileManager = null when(transfer.fetchBlocks(any(), any(), any(), any(), any(), any())) .thenAnswer(new Answer[Unit] { override def answer(invocation: InvocationOnMock): Unit = { val listener = invocation.getArguments()(4).asInstanceOf[BlockFetchingListener] - tempShuffleFileManager = invocation.getArguments()(5).asInstanceOf[TempShuffleFileManager] + tempFileManager = invocation.getArguments()(5).asInstanceOf[TempFileManager] Future { listener.onBlockFetchSuccess( ShuffleBlockId(0, 0, 0).toString, remoteBlocks(ShuffleBlockId(0, 0, 0))) @@ -472,13 +472,13 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT fetchShuffleBlock(blocksByAddress1) // `maxReqSizeShuffleToMem` is 200, which is greater than the block size 100, so don't fetch // shuffle block to disk. - assert(tempShuffleFileManager == null) + assert(tempFileManager == null) val blocksByAddress2 = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 300L)).toSeq)) fetchShuffleBlock(blocksByAddress2) // `maxReqSizeShuffleToMem` is 200, which is smaller than the block size 300, so fetch // shuffle block to disk. - assert(tempShuffleFileManager != null) + assert(tempFileManager != null) } } diff --git a/docs/configuration.md b/docs/configuration.md index 7a777d3c6fa3d..bb06c8faaaed7 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -547,13 +547,14 @@ Apart from these, the following properties are also available, and may be useful - spark.reducer.maxReqSizeShuffleToMem + spark.maxRemoteBlockSizeFetchToMem Long.MaxValue - The blocks of a shuffle request will be fetched to disk when size of the request is above - this threshold. This is to avoid a giant request takes too much memory. We can enable this - config by setting a specific value(e.g. 200m). Note that this config can be enabled only when - the shuffle shuffle service is newer than Spark-2.2 or the shuffle service is disabled. + The remote block will be fetched to disk when size of the block is above this threshold. + This is to avoid a giant request takes too much memory. We can enable this config by setting + a specific value(e.g. 200m). Note this configuration will affect both shuffle fetch + and block manager remote block fetch. For users who enabled external shuffle service, + this feature can only be worked when external shuffle service is newer than Spark 2.2. From 75d666b95a711787355ca3895057dabadd429023 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Tue, 17 Oct 2017 12:26:53 -0700 Subject: [PATCH 724/779] [SPARK-22136][SS] Evaluate one-sided conditions early in stream-stream joins. ## What changes were proposed in this pull request? Evaluate one-sided conditions early in stream-stream joins. This is in addition to normal filter pushdown, because integrating it with the join logic allows it to take place in outer join scenarios. This means that rows which can never satisfy the join condition won't clog up the state. ## How was this patch tested? new unit tests Author: Jose Torres Closes #19452 from joseph-torres/SPARK-22136. 
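The split rule at the heart of this change can be illustrated with a minimal, self-contained sketch that uses toy types rather than Spark's Catalyst `Expression` classes: conjuncts are scanned left to right, the scan stops at the first nondeterministic conjunct (nondeterministic expressions do not commute across `AND`), and the remaining deterministic conjuncts are bucketed by which side of the join they reference. Literal-only conjuncts reference neither side, so they land in both single-side buckets.

```scala
object JoinConditionSplitSketch {
  // A conjunct is represented only by the attribute names it references and whether it
  // is deterministic; this stands in for a Catalyst Expression in the sketch.
  case class Conjunct(refs: Set[String], deterministic: Boolean = true)

  case class Split(
      leftOnly: Seq[Conjunct],
      rightOnly: Seq[Conjunct],
      bothSides: Seq[Conjunct])

  def split(
      conjuncts: Seq[Conjunct],
      leftAttrs: Set[String],
      rightAttrs: Set[String]): Split = {
    // Span rather than partition: everything from the first nondeterministic conjunct
    // onwards stays un-split and must be evaluated on the joined row.
    val (deterministic, rest) = conjuncts.span(_.deterministic)
    val (leftOnly, notLeft) = deterministic.partition(_.refs.subsetOf(leftAttrs))
    val (rightOnly, notRight) = deterministic.partition(_.refs.subsetOf(rightAttrs))
    Split(leftOnly, rightOnly, notLeft.intersect(notRight) ++ rest)
  }

  def main(args: Array[String]): Unit = {
    val conjuncts = Seq(
      Conjunct(Set("a")),                          // e.g. leftValue > 4      -> left only
      Conjunct(Set("c")),                          // e.g. rightValue > 7     -> right only
      Conjunct(Set("a", "c")),                     // e.g. leftKey = rightKey -> both sides
      Conjunct(Set.empty, deterministic = false))  // e.g. rand() > 0         -> both sides
    println(split(conjuncts, leftAttrs = Set("a", "b"), rightAttrs = Set("c", "d")))
  }
}
```

The left-only and right-only buckets correspond to the pre-join filters evaluated by each side's joiner before a row is added to the state; the remaining conjuncts are applied as a post-join filter on the joined row, as shown in the diff below.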
--- .../streaming/IncrementalExecution.scala | 2 +- .../StreamingSymmetricHashJoinExec.scala | 134 ++++++++++------ .../StreamingSymmetricHashJoinHelper.scala | 70 +++++++- .../sql/streaming/StreamingJoinSuite.scala | 150 +++++++++++++++++- ...treamingSymmetricHashJoinHelperSuite.scala | 130 +++++++++++++++ 5 files changed, 433 insertions(+), 53 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 2e378637727fc..a10ed5f2df1b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -133,7 +133,7 @@ class IncrementalExecution( eventTimeWatermark = Some(offsetSeqMetadata.batchWatermarkMs), stateWatermarkPredicates = StreamingSymmetricHashJoinHelper.getStateWatermarkPredicates( - j.left.output, j.right.output, j.leftKeys, j.rightKeys, j.condition, + j.left.output, j.right.output, j.leftKeys, j.rightKeys, j.condition.full, Some(offsetSeqMetadata.batchWatermarkMs)) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 9bd2127a28ff6..c351f658cb955 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences, Expression, GenericInternalRow, JoinedRow, Literal, NamedExpression, PreciseTimestampConversion, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, JoinedRow, Literal, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ import org.apache.spark.sql.catalyst.plans.physical._ @@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper._ import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.internal.SessionState -import org.apache.spark.sql.types.{LongType, TimestampType} import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} @@ -115,7 +114,8 @@ import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} * @param leftKeys Expression to generate key rows for joining from left input * @param rightKeys Expression to generate key rows for joining from right input * @param joinType Type of join (inner, left outer, etc.) - * @param condition Optional, additional condition to filter output of the equi-join + * @param condition Conditions to filter rows, split by left, right, and joined. 
See + * [[JoinConditionSplitPredicates]] * @param stateInfo Version information required to read join state (buffered rows) * @param eventTimeWatermark Watermark of input event, same for both sides * @param stateWatermarkPredicates Predicates for removal of state, see @@ -127,7 +127,7 @@ case class StreamingSymmetricHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, - condition: Option[Expression], + condition: JoinConditionSplitPredicates, stateInfo: Option[StatefulOperatorStateInfo], eventTimeWatermark: Option[Long], stateWatermarkPredicates: JoinStateWatermarkPredicates, @@ -141,8 +141,10 @@ case class StreamingSymmetricHashJoinExec( condition: Option[Expression], left: SparkPlan, right: SparkPlan) = { + this( - leftKeys, rightKeys, joinType, condition, stateInfo = None, eventTimeWatermark = None, + leftKeys, rightKeys, joinType, JoinConditionSplitPredicates(condition, left, right), + stateInfo = None, eventTimeWatermark = None, stateWatermarkPredicates = JoinStateWatermarkPredicates(), left, right) } @@ -161,6 +163,9 @@ case class StreamingSymmetricHashJoinExec( new SerializableConfiguration(SessionState.newHadoopConf( sparkContext.hadoopConfiguration, sqlContext.conf))) + val nullLeft = new GenericInternalRow(left.output.map(_.withNullability(true)).length) + val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length) + override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil @@ -206,10 +211,15 @@ case class StreamingSymmetricHashJoinExec( val updateStartTimeNs = System.nanoTime val joinedRow = new JoinedRow + + val postJoinFilter = + newPredicate(condition.bothSides.getOrElse(Literal(true)), left.output ++ right.output).eval _ val leftSideJoiner = new OneSideHashJoiner( - LeftSide, left.output, leftKeys, leftInputIter, stateWatermarkPredicates.left) + LeftSide, left.output, leftKeys, leftInputIter, + condition.leftSideOnly, postJoinFilter, stateWatermarkPredicates.left) val rightSideJoiner = new OneSideHashJoiner( - RightSide, right.output, rightKeys, rightInputIter, stateWatermarkPredicates.right) + RightSide, right.output, rightKeys, rightInputIter, + condition.rightSideOnly, postJoinFilter, stateWatermarkPredicates.right) // Join one side input using the other side's buffered/state rows. Here is how it is done. // @@ -221,43 +231,28 @@ case class StreamingSymmetricHashJoinExec( // matching new left input with new right input, since the new left input has become stored // by that point. This tiny asymmetry is necessary to avoid duplication. val leftOutputIter = leftSideJoiner.storeAndJoinWithOtherSide(rightSideJoiner) { - (input: UnsafeRow, matched: UnsafeRow) => joinedRow.withLeft(input).withRight(matched) + (input: InternalRow, matched: InternalRow) => joinedRow.withLeft(input).withRight(matched) } val rightOutputIter = rightSideJoiner.storeAndJoinWithOtherSide(leftSideJoiner) { - (input: UnsafeRow, matched: UnsafeRow) => joinedRow.withLeft(matched).withRight(input) + (input: InternalRow, matched: InternalRow) => joinedRow.withLeft(matched).withRight(input) } - // Filter the joined rows based on the given condition. - val outputFilterFunction = newPredicate(condition.getOrElse(Literal(true)), output).eval _ - // We need to save the time that the inner join output iterator completes, since outer join // output counts as both update and removal time. 
var innerOutputCompletionTimeNs: Long = 0 def onInnerOutputCompletion = { innerOutputCompletionTimeNs = System.nanoTime } - val filteredInnerOutputIter = CompletionIterator[InternalRow, Iterator[InternalRow]]( - (leftOutputIter ++ rightOutputIter).filter(outputFilterFunction), onInnerOutputCompletion) - - def matchesWithRightSideState(leftKeyValue: UnsafeRowPair) = { - rightSideJoiner.get(leftKeyValue.key).exists( - rightValue => { - outputFilterFunction( - joinedRow.withLeft(leftKeyValue.value).withRight(rightValue)) - }) - } + // This is the iterator which produces the inner join rows. For outer joins, this will be + // prepended to a second iterator producing outer join rows; for inner joins, this is the full + // output. + val innerOutputIter = CompletionIterator[InternalRow, Iterator[InternalRow]]( + (leftOutputIter ++ rightOutputIter), onInnerOutputCompletion) - def matchesWithLeftSideState(rightKeyValue: UnsafeRowPair) = { - leftSideJoiner.get(rightKeyValue.key).exists( - leftValue => { - outputFilterFunction( - joinedRow.withLeft(leftValue).withRight(rightKeyValue.value)) - }) - } val outputIter: Iterator[InternalRow] = joinType match { case Inner => - filteredInnerOutputIter + innerOutputIter case LeftOuter => // We generate the outer join input by: // * Getting an iterator over the rows that have aged out on the left side. These rows are @@ -268,28 +263,37 @@ case class StreamingSymmetricHashJoinExec( // we know we can join with null, since there was never (including this batch) a match // within the watermark period. If it does, there must have been a match at some point, so // we know we can't join with null. - val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length) + def matchesWithRightSideState(leftKeyValue: UnsafeRowPair) = { + rightSideJoiner.get(leftKeyValue.key).exists { rightValue => + postJoinFilter(joinedRow.withLeft(leftKeyValue.value).withRight(rightValue)) + } + } val removedRowIter = leftSideJoiner.removeOldState() val outerOutputIter = removedRowIter .filterNot(pair => matchesWithRightSideState(pair)) .map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) - filteredInnerOutputIter ++ outerOutputIter + innerOutputIter ++ outerOutputIter case RightOuter => // See comments for left outer case. - val nullLeft = new GenericInternalRow(left.output.map(_.withNullability(true)).length) + def matchesWithLeftSideState(rightKeyValue: UnsafeRowPair) = { + leftSideJoiner.get(rightKeyValue.key).exists { leftValue => + postJoinFilter(joinedRow.withLeft(leftValue).withRight(rightKeyValue.value)) + } + } val removedRowIter = rightSideJoiner.removeOldState() val outerOutputIter = removedRowIter .filterNot(pair => matchesWithLeftSideState(pair)) .map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) - filteredInnerOutputIter ++ outerOutputIter + innerOutputIter ++ outerOutputIter case _ => throwBadJoinTypeException() } + val outputProjection = UnsafeProjection.create(left.output ++ right.output, output) val outputIterWithMetrics = outputIter.map { row => numOutputRows += 1 - row + outputProjection(row) } // Function to remove old state after all the input has been consumed and output generated @@ -349,14 +353,36 @@ case class StreamingSymmetricHashJoinExec( /** * Internal helper class to consume input rows, generate join output rows using other sides * buffered state rows, and finally clean up this sides buffered state rows + * + * @param joinSide The JoinSide - either left or right. 
+ * @param inputAttributes The input attributes for this side of the join. + * @param joinKeys The join keys. + * @param inputIter The iterator of input rows on this side to be joined. + * @param preJoinFilterExpr A filter over rows on this side. This filter rejects rows that could + * never pass the overall join condition no matter what other side row + * they're joined with. + * @param postJoinFilter A filter over joined rows. This filter completes the application of + * the overall join condition, assuming that preJoinFilter on both sides + * of the join has already been passed. + * Passed as a function rather than expression to avoid creating the + * predicate twice; we also need this filter later on in the parent exec. + * @param stateWatermarkPredicate The state watermark predicate. See + * [[StreamingSymmetricHashJoinExec]] for further description of + * state watermarks. */ private class OneSideHashJoiner( joinSide: JoinSide, inputAttributes: Seq[Attribute], joinKeys: Seq[Expression], inputIter: Iterator[InternalRow], + preJoinFilterExpr: Option[Expression], + postJoinFilter: (InternalRow) => Boolean, stateWatermarkPredicate: Option[JoinStateWatermarkPredicate]) { + // Filter the joined rows based on the given condition. + val preJoinFilter = + newPredicate(preJoinFilterExpr.getOrElse(Literal(true)), inputAttributes).eval _ + private val joinStateManager = new SymmetricHashJoinStateManager( joinSide, inputAttributes, joinKeys, stateInfo, storeConf, hadoopConfBcast.value.value) private[this] val keyGenerator = UnsafeProjection.create(joinKeys, inputAttributes) @@ -388,8 +414,8 @@ case class StreamingSymmetricHashJoinExec( */ def storeAndJoinWithOtherSide( otherSideJoiner: OneSideHashJoiner)( - generateJoinedRow: (UnsafeRow, UnsafeRow) => JoinedRow): Iterator[InternalRow] = { - + generateJoinedRow: (InternalRow, InternalRow) => JoinedRow): + Iterator[InternalRow] = { val watermarkAttribute = inputAttributes.find(_.metadata.contains(delayKey)) val nonLateRows = WatermarkSupport.watermarkExpression(watermarkAttribute, eventTimeWatermark) match { @@ -402,17 +428,31 @@ case class StreamingSymmetricHashJoinExec( nonLateRows.flatMap { row => val thisRow = row.asInstanceOf[UnsafeRow] - val key = keyGenerator(thisRow) - val outputIter = otherSideJoiner.joinStateManager.get(key).map { thatRow => - generateJoinedRow(thisRow, thatRow) - } - val shouldAddToState = // add only if both removal predicates do not match - !stateKeyWatermarkPredicateFunc(key) && !stateValueWatermarkPredicateFunc(thisRow) - if (shouldAddToState) { - joinStateManager.append(key, thisRow) - updatedStateRowsCount += 1 + // If this row fails the pre join filter, that means it can never satisfy the full join + // condition no matter what other side row it's matched with. This allows us to avoid + // adding it to the state, and generate an outer join row immediately (or do nothing in + // the case of inner join). 
+ if (preJoinFilter(thisRow)) { + val key = keyGenerator(thisRow) + val outputIter = otherSideJoiner.joinStateManager.get(key).map { thatRow => + generateJoinedRow(thisRow, thatRow) + }.filter(postJoinFilter) + val shouldAddToState = // add only if both removal predicates do not match + !stateKeyWatermarkPredicateFunc(key) && !stateValueWatermarkPredicateFunc(thisRow) + if (shouldAddToState) { + joinStateManager.append(key, thisRow) + updatedStateRowsCount += 1 + } + outputIter + } else { + joinSide match { + case LeftSide if joinType == LeftOuter => + Iterator(generateJoinedRow(thisRow, nullRight)) + case RightSide if joinType == RightOuter => + Iterator(generateJoinedRow(thisRow, nullLeft)) + case _ => Iterator() + } } - outputIter } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index 64c7189f72ac3..167e991ca62f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -24,8 +24,9 @@ import org.apache.spark.{Partition, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.{RDD, ZippedPartitionsRDD2} import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper -import org.apache.spark.sql.catalyst.expressions.{Add, Attribute, AttributeReference, AttributeSet, BoundReference, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, NamedExpression, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} +import org.apache.spark.sql.catalyst.expressions.{Add, And, Attribute, AttributeReference, AttributeSet, BoundReference, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, NamedExpression, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.WatermarkSupport.watermarkExpression import org.apache.spark.sql.execution.streaming.state.{StateStoreCoordinatorRef, StateStoreProvider, StateStoreProviderId} import org.apache.spark.sql.types._ @@ -66,6 +67,73 @@ object StreamingSymmetricHashJoinHelper extends Logging { } } + /** + * Wrapper around various useful splits of the join condition. + * left AND right AND joined is equivalent to full. + * + * Note that left and right do not necessarily contain *all* conjuncts which satisfy + * their condition. Any conjuncts after the first nondeterministic one are treated as + * nondeterministic for purposes of the split. + * + * @param leftSideOnly Deterministic conjuncts which reference only the left side of the join. + * @param rightSideOnly Deterministic conjuncts which reference only the right side of the join. + * @param bothSides Conjuncts which are nondeterministic, occur after a nondeterministic conjunct, + * or reference both left and right sides of the join. + * @param full The full join condition. 
+ */ + case class JoinConditionSplitPredicates( + leftSideOnly: Option[Expression], + rightSideOnly: Option[Expression], + bothSides: Option[Expression], + full: Option[Expression]) { + override def toString(): String = { + s"condition = [ leftOnly = ${leftSideOnly.map(_.toString).getOrElse("null")}, " + + s"rightOnly = ${rightSideOnly.map(_.toString).getOrElse("null")}, " + + s"both = ${bothSides.map(_.toString).getOrElse("null")}, " + + s"full = ${full.map(_.toString).getOrElse("null")} ]" + } + } + + object JoinConditionSplitPredicates extends PredicateHelper { + def apply(condition: Option[Expression], left: SparkPlan, right: SparkPlan): + JoinConditionSplitPredicates = { + // Split the condition into 3 parts: + // * Conjuncts that can be evaluated on only the left input. + // * Conjuncts that can be evaluated on only the right input. + // * Conjuncts that require both left and right input. + // + // Note that we treat nondeterministic conjuncts as though they require both left and right + // input. To maintain their semantics, they need to be evaluated exactly once per joined row. + val (leftCondition, rightCondition, joinedCondition) = { + if (condition.isEmpty) { + (None, None, None) + } else { + // Span rather than partition, because nondeterministic expressions don't commute + // across AND. + val (deterministicConjuncts, nonDeterministicConjuncts) = + splitConjunctivePredicates(condition.get).span(_.deterministic) + + val (leftConjuncts, nonLeftConjuncts) = deterministicConjuncts.partition { cond => + cond.references.subsetOf(left.outputSet) + } + + val (rightConjuncts, nonRightConjuncts) = deterministicConjuncts.partition { cond => + cond.references.subsetOf(right.outputSet) + } + + ( + leftConjuncts.reduceOption(And), + rightConjuncts.reduceOption(And), + (nonLeftConjuncts.intersect(nonRightConjuncts) ++ nonDeterministicConjuncts) + .reduceOption(And) + ) + } + } + + JoinConditionSplitPredicates(leftCondition, rightCondition, joinedCondition, condition) + } + } + /** Get the predicates defining the state watermarks for both sides of the join */ def getStateWatermarkPredicates( leftAttributes: Seq[Attribute], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index d32617275aadc..54eb863dacc83 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -365,6 +365,24 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with } } } + + test("join between three streams") { + val input1 = MemoryStream[Int] + val input2 = MemoryStream[Int] + val input3 = MemoryStream[Int] + + val df1 = input1.toDF.select('value as "leftKey", ('value * 2) as "leftValue") + val df2 = input2.toDF.select('value as "middleKey", ('value * 3) as "middleValue") + val df3 = input3.toDF.select('value as "rightKey", ('value * 5) as "rightValue") + + val joined = df1.join(df2, expr("leftKey = middleKey")).join(df3, expr("rightKey = middleKey")) + + testStream(joined)( + AddData(input1, 1, 5), + AddData(input2, 1, 5, 10), + AddData(input3, 5, 10), + CheckLastBatch((5, 10, 5, 15, 5, 25))) + } } class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with BeforeAndAfter { @@ -405,6 +423,130 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with (input1, input2, joined) } + test("left outer early 
state exclusion on left") { + val (leftInput, df1) = setupStream("left", 2) + val (rightInput, df2) = setupStream("right", 3) + // Use different schemas to ensure the null row is being generated from the correct side. + val left = df1.select('key, window('leftTime, "10 second"), 'leftValue) + val right = df2.select('key, window('rightTime, "10 second"), 'rightValue.cast("string")) + + val joined = left.join( + right, + left("key") === right("key") + && left("window") === right("window") + && 'leftValue > 4, + "left_outer") + .select(left("key"), left("window.end").cast("long"), 'leftValue, 'rightValue) + + testStream(joined)( + AddData(leftInput, 1, 2, 3), + AddData(rightInput, 3, 4, 5), + // The left rows with leftValue <= 4 should generate their outer join row now and + // not get added to the state. + CheckLastBatch(Row(3, 10, 6, "9"), Row(1, 10, 2, null), Row(2, 10, 4, null)), + assertNumStateRows(total = 4, updated = 4), + // We shouldn't get more outer join rows when the watermark advances. + AddData(leftInput, 20), + AddData(rightInput, 21), + CheckLastBatch(), + AddData(rightInput, 20), + CheckLastBatch((20, 30, 40, "60")) + ) + } + + test("left outer early state exclusion on right") { + val (leftInput, df1) = setupStream("left", 2) + val (rightInput, df2) = setupStream("right", 3) + // Use different schemas to ensure the null row is being generated from the correct side. + val left = df1.select('key, window('leftTime, "10 second"), 'leftValue) + val right = df2.select('key, window('rightTime, "10 second"), 'rightValue.cast("string")) + + val joined = left.join( + right, + left("key") === right("key") + && left("window") === right("window") + && 'rightValue.cast("int") > 7, + "left_outer") + .select(left("key"), left("window.end").cast("long"), 'leftValue, 'rightValue) + + testStream(joined)( + AddData(leftInput, 3, 4, 5), + AddData(rightInput, 1, 2, 3), + // The right rows with value <= 7 should never be added to the state. + CheckLastBatch(Row(3, 10, 6, "9")), + assertNumStateRows(total = 4, updated = 4), + // When the watermark advances, we get the outer join rows just as we would if they + // were added but didn't match the full join condition. + AddData(leftInput, 20), + AddData(rightInput, 21), + CheckLastBatch(), + AddData(rightInput, 20), + CheckLastBatch(Row(20, 30, 40, "60"), Row(4, 10, 8, null), Row(5, 10, 10, null)) + ) + } + + test("right outer early state exclusion on left") { + val (leftInput, df1) = setupStream("left", 2) + val (rightInput, df2) = setupStream("right", 3) + // Use different schemas to ensure the null row is being generated from the correct side. + val left = df1.select('key, window('leftTime, "10 second"), 'leftValue) + val right = df2.select('key, window('rightTime, "10 second"), 'rightValue.cast("string")) + + val joined = left.join( + right, + left("key") === right("key") + && left("window") === right("window") + && 'leftValue > 4, + "right_outer") + .select(right("key"), right("window.end").cast("long"), 'leftValue, 'rightValue) + + testStream(joined)( + AddData(leftInput, 1, 2, 3), + AddData(rightInput, 3, 4, 5), + // The left rows with value <= 4 should never be added to the state. + CheckLastBatch(Row(3, 10, 6, "9")), + assertNumStateRows(total = 4, updated = 4), + // When the watermark advances, we get the outer join rows just as we would if they + // were added but didn't match the full join condition. 
+ AddData(leftInput, 20), + AddData(rightInput, 21), + CheckLastBatch(), + AddData(rightInput, 20), + CheckLastBatch(Row(20, 30, 40, "60"), Row(4, 10, null, "12"), Row(5, 10, null, "15")) + ) + } + + test("right outer early state exclusion on right") { + val (leftInput, df1) = setupStream("left", 2) + val (rightInput, df2) = setupStream("right", 3) + // Use different schemas to ensure the null row is being generated from the correct side. + val left = df1.select('key, window('leftTime, "10 second"), 'leftValue) + val right = df2.select('key, window('rightTime, "10 second"), 'rightValue.cast("string")) + + val joined = left.join( + right, + left("key") === right("key") + && left("window") === right("window") + && 'rightValue.cast("int") > 7, + "right_outer") + .select(right("key"), right("window.end").cast("long"), 'leftValue, 'rightValue) + + testStream(joined)( + AddData(leftInput, 3, 4, 5), + AddData(rightInput, 1, 2, 3), + // The right rows with rightValue <= 7 should generate their outer join row now and + // not get added to the state. + CheckLastBatch(Row(3, 10, 6, "9"), Row(1, 10, null, "3"), Row(2, 10, null, "6")), + assertNumStateRows(total = 4, updated = 4), + // We shouldn't get more outer join rows when the watermark advances. + AddData(leftInput, 20), + AddData(rightInput, 21), + CheckLastBatch(), + AddData(rightInput, 20), + CheckLastBatch((20, 30, 40, "60")) + ) + } + test("windowed left outer join") { val (leftInput, rightInput, joined) = setupWindowedJoin("left_outer") @@ -495,7 +637,7 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with // When the join condition isn't true, the outer null rows must be generated, even if the join // keys themselves have a match. - test("left outer join with non-key condition violated on left") { + test("left outer join with non-key condition violated") { val (leftInput, simpleLeftDf) = setupStream("left", 2) val (rightInput, simpleRightDf) = setupStream("right", 3) @@ -513,14 +655,14 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with // leftValue <= 10 should generate outer join rows even though it matches right keys AddData(leftInput, 1, 2, 3), AddData(rightInput, 1, 2, 3), - CheckLastBatch(), + CheckLastBatch(Row(1, 10, 2, null), Row(2, 10, 4, null), Row(3, 10, 6, null)), AddData(leftInput, 20), AddData(rightInput, 21), CheckLastBatch(), - assertNumStateRows(total = 8, updated = 2), + assertNumStateRows(total = 5, updated = 2), AddData(rightInput, 20), CheckLastBatch( - Row(20, 30, 40, 60), Row(1, 10, 2, null), Row(2, 10, 4, null), Row(3, 10, 6, null)), + Row(20, 30, 40, 60)), assertNumStateRows(total = 3, updated = 1), // leftValue and rightValue both satisfying condition should not generate outer join rows AddData(leftInput, 40, 41), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala new file mode 100644 index 0000000000000..2a854e37bf0df --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.execution.{LeafExecNode, LocalTableScanExec, SparkPlan} +import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.JoinConditionSplitPredicates +import org.apache.spark.sql.types._ + +class StreamingSymmetricHashJoinHelperSuite extends StreamTest { + import org.apache.spark.sql.functions._ + + val leftAttributeA = AttributeReference("a", IntegerType)() + val leftAttributeB = AttributeReference("b", IntegerType)() + val rightAttributeC = AttributeReference("c", IntegerType)() + val rightAttributeD = AttributeReference("d", IntegerType)() + val leftColA = new Column(leftAttributeA) + val leftColB = new Column(leftAttributeB) + val rightColC = new Column(rightAttributeC) + val rightColD = new Column(rightAttributeD) + + val left = new LocalTableScanExec(Seq(leftAttributeA, leftAttributeB), Seq()) + val right = new LocalTableScanExec(Seq(rightAttributeC, rightAttributeD), Seq()) + + test("empty") { + val split = JoinConditionSplitPredicates(None, left, right) + assert(split.leftSideOnly.isEmpty) + assert(split.rightSideOnly.isEmpty) + assert(split.bothSides.isEmpty) + assert(split.full.isEmpty) + } + + test("only literals") { + // Literal-only conjuncts end up on the left side because that's the first bucket they fit in. + // There's no semantic reason they couldn't be in any bucket. 
+ val predicate = (lit(1) < lit(5) && lit(6) < lit(7) && lit(0) === lit(-1)).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.contains(predicate)) + assert(split.rightSideOnly.contains(predicate)) + assert(split.bothSides.isEmpty) + assert(split.full.contains(predicate)) + } + + test("only left") { + val predicate = (leftColA > lit(1) && leftColB > lit(5) && leftColA < leftColB).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.contains(predicate)) + assert(split.rightSideOnly.isEmpty) + assert(split.bothSides.isEmpty) + assert(split.full.contains(predicate)) + } + + test("only right") { + val predicate = (rightColC > lit(1) && rightColD > lit(5) && rightColD < rightColC).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.isEmpty) + assert(split.rightSideOnly.contains(predicate)) + assert(split.bothSides.isEmpty) + assert(split.full.contains(predicate)) + } + + test("mixed conjuncts") { + val predicate = + (leftColA > leftColB + && rightColC > rightColD + && leftColA === rightColC + && lit(1) === lit(1)).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.contains((leftColA > leftColB && lit(1) === lit(1)).expr)) + assert(split.rightSideOnly.contains((rightColC > rightColD && lit(1) === lit(1)).expr)) + assert(split.bothSides.contains((leftColA === rightColC).expr)) + assert(split.full.contains(predicate)) + } + + test("conjuncts after nondeterministic") { + // All conjuncts after a nondeterministic conjunct shouldn't be split because they don't + // commute across it. + val predicate = + (rand() > lit(0) + && leftColA > leftColB + && rightColC > rightColD + && leftColA === rightColC + && lit(1) === lit(1)).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.isEmpty) + assert(split.rightSideOnly.isEmpty) + assert(split.bothSides.contains(predicate)) + assert(split.full.contains(predicate)) + } + + + test("conjuncts before nondeterministic") { + val randCol = rand() + val predicate = + (leftColA > leftColB + && rightColC > rightColD + && leftColA === rightColC + && lit(1) === lit(1) + && randCol > lit(0)).expr + val split = JoinConditionSplitPredicates(Some(predicate), left, right) + + assert(split.leftSideOnly.contains((leftColA > leftColB && lit(1) === lit(1)).expr)) + assert(split.rightSideOnly.contains((rightColC > rightColD && lit(1) === lit(1)).expr)) + assert(split.bothSides.contains((leftColA === rightColC && randCol > lit(0)).expr)) + assert(split.full.contains(predicate)) + } +} From 28f9f3f22511e9f2f900764d9bd5b90d2eeee773 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 17 Oct 2017 12:50:41 -0700 Subject: [PATCH 725/779] [SPARK-22271][SQL] mean overflows and returns null for some decimal variables ## What changes were proposed in this pull request? In Average.scala, it has ``` override lazy val evaluateExpression = child.dataType match { case DecimalType.Fixed(p, s) => // increase the precision and scale to prevent precision loss val dt = DecimalType.bounded(p + 14, s + 4) Cast(Cast(sum, dt) / Cast(count, dt), resultType) case _ => Cast(sum, resultType) / Cast(count, resultType) } def setChild (newchild: Expression) = { child = newchild } ``` It is possible that Cast(count, dt), resultType) will make the precision of the decimal number bigger than 38, and this causes over flow. 
Since count is an integer and doesn't need a scale, I will cast it using DecimalType.bounded(38,0) ## How was this patch tested? In DataFrameSuite, I will add a test case. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Huaxin Gao Closes #19496 from huaxingao/spark-22271. --- .../sql/catalyst/expressions/aggregate/Average.scala | 3 ++- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala index c423e17169e85..708bdbfc36058 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala @@ -80,7 +80,8 @@ case class Average(child: Expression) extends DeclarativeAggregate with Implicit case DecimalType.Fixed(p, s) => // increase the precision and scale to prevent precision loss val dt = DecimalType.bounded(p + 14, s + 4) - Cast(Cast(sum, dt) / Cast(count, dt), resultType) + Cast(Cast(sum, dt) / Cast(count, DecimalType.bounded(DecimalType.MAX_PRECISION, 0)), + resultType) case _ => Cast(sum, resultType) / Cast(count, resultType) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 50de2fd3bca8d..473c355cf3c7f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2105,4 +2105,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { testData2.select(lit(7), 'a, 'b).orderBy(lit(1), lit(2), lit(3)), Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2))) } + + test("SPARK-22271: mean overflows and returns null for some decimal variables") { + val d = 0.034567890 + val df = Seq(d, d, d, d, d, d, d, d, d, d).toDF("DecimalCol") + val result = df.select('DecimalCol cast DecimalType(38, 33)) + .select(col("DecimalCol")).describe() + val mean = result.select("DecimalCol").where($"summary" === "mean") + assert(mean.collect().toSet === Set(Row("0.0345678900000000000000000000000000000"))) + } } From 1437e344ec0c29a44a19f4513986f5f184c44695 Mon Sep 17 00:00:00 2001 From: Michael Mior Date: Tue, 17 Oct 2017 14:30:52 -0700 Subject: [PATCH 726/779] [SPARK-22050][CORE] Allow BlockUpdated events to be optionally logged to the event log ## What changes were proposed in this pull request? I see that block updates are not logged to the event log. This makes sense as a default for performance reasons. However, I find it helpful when trying to get a better understanding of caching for a job to be able to log these updates. This PR adds a configuration setting `spark.eventLog.blockUpdates` (defaulting to false) which allows block updates to be recorded in the log. This contribution is original work which is licensed to the Apache Spark project. ## How was this patch tested? Current and additional unit tests. Author: Michael Mior Closes #19263 from michaelmior/log-block-updates. 
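The flag can be enabled alongside the existing event-log settings; note that the key defined in the patch is `spark.eventLog.logBlockUpdates.enabled`, not the `spark.eventLog.blockUpdates` shorthand used in the description above. A minimal sketch follows, in which the application name, local master, and event-log directory are placeholder assumptions:

```scala
import org.apache.spark.sql.SparkSession

object BlockUpdateLoggingExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("block-update-logging")                           // placeholder
      .master("local[2]")                                        // placeholder
      .config("spark.eventLog.enabled", "true")
      .config("spark.eventLog.dir", "file:///tmp/spark-events")  // directory must already exist
      .config("spark.eventLog.logBlockUpdates.enabled", "true")  // flag added by this patch
      .getOrCreate()

    // Caching and materializing a dataset produces block updates, which are now written
    // to the event log as SparkListenerBlockUpdated events.
    spark.range(0, 1000000).cache().count()
    spark.stop()
  }
}
```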
--- .../spark/internal/config/package.scala | 23 +++++++++++++ .../scheduler/EventLoggingListener.scala | 18 ++++++---- .../org/apache/spark/util/JsonProtocol.scala | 34 +++++++++++++++++-- .../scheduler/EventLoggingListenerSuite.scala | 2 ++ .../apache/spark/util/JsonProtocolSuite.scala | 27 +++++++++++++++ docs/configuration.md | 8 +++++ 6 files changed, 104 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index e7b406af8d9b1..0c36bdcdd2904 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -41,6 +41,29 @@ package object config { .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") + private[spark] val EVENT_LOG_COMPRESS = + ConfigBuilder("spark.eventLog.compress") + .booleanConf + .createWithDefault(false) + + private[spark] val EVENT_LOG_BLOCK_UPDATES = + ConfigBuilder("spark.eventLog.logBlockUpdates.enabled") + .booleanConf + .createWithDefault(false) + + private[spark] val EVENT_LOG_TESTING = + ConfigBuilder("spark.eventLog.testing") + .internal() + .booleanConf + .createWithDefault(false) + + private[spark] val EVENT_LOG_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.eventLog.buffer.kb") + .bytesConf(ByteUnit.KiB) + .createWithDefaultString("100k") + + private[spark] val EVENT_LOG_OVERWRITE = + ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false) + private[spark] val EXECUTOR_CLASS_PATH = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH).stringConf.createOptional diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 9dafa0b7646bf..a77adc5ff3545 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -37,6 +37,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SPARK_VERSION, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ import org.apache.spark.io.CompressionCodec import org.apache.spark.util.{JsonProtocol, Utils} @@ -45,6 +46,7 @@ import org.apache.spark.util.{JsonProtocol, Utils} * * Event logging is specified by the following configurable parameters: * spark.eventLog.enabled - Whether event logging is enabled. + * spark.eventLog.logBlockUpdates.enabled - Whether to log block updates * spark.eventLog.compress - Whether to compress logged events * spark.eventLog.overwrite - Whether to overwrite any existing files. * spark.eventLog.dir - Path to the directory in which events are logged. 
@@ -64,10 +66,11 @@ private[spark] class EventLoggingListener( this(appId, appAttemptId, logBaseDir, sparkConf, SparkHadoopUtil.get.newConfiguration(sparkConf)) - private val shouldCompress = sparkConf.getBoolean("spark.eventLog.compress", false) - private val shouldOverwrite = sparkConf.getBoolean("spark.eventLog.overwrite", false) - private val testing = sparkConf.getBoolean("spark.eventLog.testing", false) - private val outputBufferSize = sparkConf.getInt("spark.eventLog.buffer.kb", 100) * 1024 + private val shouldCompress = sparkConf.get(EVENT_LOG_COMPRESS) + private val shouldOverwrite = sparkConf.get(EVENT_LOG_OVERWRITE) + private val shouldLogBlockUpdates = sparkConf.get(EVENT_LOG_BLOCK_UPDATES) + private val testing = sparkConf.get(EVENT_LOG_TESTING) + private val outputBufferSize = sparkConf.get(EVENT_LOG_OUTPUT_BUFFER_SIZE).toInt private val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf) private val compressionCodec = if (shouldCompress) { @@ -216,8 +219,11 @@ private[spark] class EventLoggingListener( logEvent(event, flushLogger = true) } - // No-op because logging every update would be overkill - override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = {} + override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = { + if (shouldLogBlockUpdates) { + logEvent(event, flushLogger = true) + } + } // No-op because logging every update would be overkill override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 8406826a228db..5e60218c5740b 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -98,8 +98,8 @@ private[spark] object JsonProtocol { logStartToJson(logStart) case metricsUpdate: SparkListenerExecutorMetricsUpdate => executorMetricsUpdateToJson(metricsUpdate) - case blockUpdated: SparkListenerBlockUpdated => - throw new MatchError(blockUpdated) // TODO(ekl) implement this + case blockUpdate: SparkListenerBlockUpdated => + blockUpdateToJson(blockUpdate) case _ => parse(mapper.writeValueAsString(event)) } } @@ -246,6 +246,12 @@ private[spark] object JsonProtocol { }) } + def blockUpdateToJson(blockUpdate: SparkListenerBlockUpdated): JValue = { + val blockUpdatedInfo = blockUpdatedInfoToJson(blockUpdate.blockUpdatedInfo) + ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockUpdate) ~ + ("Block Updated Info" -> blockUpdatedInfo) + } + /** ------------------------------------------------------------------- * * JSON serialization methods for classes SparkListenerEvents depend on | * -------------------------------------------------------------------- */ @@ -458,6 +464,14 @@ private[spark] object JsonProtocol { ("Log Urls" -> mapToJson(executorInfo.logUrlMap)) } + def blockUpdatedInfoToJson(blockUpdatedInfo: BlockUpdatedInfo): JValue = { + ("Block Manager ID" -> blockManagerIdToJson(blockUpdatedInfo.blockManagerId)) ~ + ("Block ID" -> blockUpdatedInfo.blockId.toString) ~ + ("Storage Level" -> storageLevelToJson(blockUpdatedInfo.storageLevel)) ~ + ("Memory Size" -> blockUpdatedInfo.memSize) ~ + ("Disk Size" -> blockUpdatedInfo.diskSize) + } + /** ------------------------------ * * Util JSON serialization methods | * ------------------------------- */ @@ -515,6 +529,7 @@ private[spark] object JsonProtocol { val executorRemoved = 
Utils.getFormattedClassName(SparkListenerExecutorRemoved) val logStart = Utils.getFormattedClassName(SparkListenerLogStart) val metricsUpdate = Utils.getFormattedClassName(SparkListenerExecutorMetricsUpdate) + val blockUpdate = Utils.getFormattedClassName(SparkListenerBlockUpdated) } def sparkEventFromJson(json: JValue): SparkListenerEvent = { @@ -538,6 +553,7 @@ private[spark] object JsonProtocol { case `executorRemoved` => executorRemovedFromJson(json) case `logStart` => logStartFromJson(json) case `metricsUpdate` => executorMetricsUpdateFromJson(json) + case `blockUpdate` => blockUpdateFromJson(json) case other => mapper.readValue(compact(render(json)), Utils.classForName(other)) .asInstanceOf[SparkListenerEvent] } @@ -676,6 +692,11 @@ private[spark] object JsonProtocol { SparkListenerExecutorMetricsUpdate(execInfo, accumUpdates) } + def blockUpdateFromJson(json: JValue): SparkListenerBlockUpdated = { + val blockUpdatedInfo = blockUpdatedInfoFromJson(json \ "Block Updated Info") + SparkListenerBlockUpdated(blockUpdatedInfo) + } + /** --------------------------------------------------------------------- * * JSON deserialization methods for classes SparkListenerEvents depend on | * ---------------------------------------------------------------------- */ @@ -989,6 +1010,15 @@ private[spark] object JsonProtocol { new ExecutorInfo(executorHost, totalCores, logUrls) } + def blockUpdatedInfoFromJson(json: JValue): BlockUpdatedInfo = { + val blockManagerId = blockManagerIdFromJson(json \ "Block Manager ID") + val blockId = BlockId((json \ "Block ID").extract[String]) + val storageLevel = storageLevelFromJson(json \ "Storage Level") + val memorySize = (json \ "Memory Size").extract[Long] + val diskSize = (json \ "Disk Size").extract[Long] + BlockUpdatedInfo(blockManagerId, blockId, storageLevel, memorySize, diskSize) + } + /** -------------------------------- * * Util JSON deserialization methods | * --------------------------------- */ diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 6b42775ccb0f6..a9e92fa07b9dd 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -228,6 +228,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit SparkListenerStageCompleted, SparkListenerTaskStart, SparkListenerTaskEnd, + SparkListenerBlockUpdated, SparkListenerApplicationEnd).map(Utils.getFormattedClassName) Utils.tryWithSafeFinally { val logStart = SparkListenerLogStart(SPARK_VERSION) @@ -291,6 +292,7 @@ object EventLoggingListenerSuite { def getLoggingConf(logDir: Path, compressionCodec: Option[String] = None): SparkConf = { val conf = new SparkConf conf.set("spark.eventLog.enabled", "true") + conf.set("spark.eventLog.logBlockUpdates.enabled", "true") conf.set("spark.eventLog.testing", "true") conf.set("spark.eventLog.dir", logDir.toString) compressionCodec.foreach { codec => diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index a1a858765a7d4..4abbb8e7894f5 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -96,6 +96,9 @@ class JsonProtocolSuite extends SparkFunSuite { .zipWithIndex.map { case (a, i) => a.copy(id = i) } 
SparkListenerExecutorMetricsUpdate("exec3", Seq((1L, 2, 3, accumUpdates))) } + val blockUpdated = + SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId("Stars", + "In your multitude...", 300), RDDBlockId(0, 0), StorageLevel.MEMORY_ONLY, 100L, 0L)) testEvent(stageSubmitted, stageSubmittedJsonString) testEvent(stageCompleted, stageCompletedJsonString) @@ -120,6 +123,7 @@ class JsonProtocolSuite extends SparkFunSuite { testEvent(nodeBlacklisted, nodeBlacklistedJsonString) testEvent(nodeUnblacklisted, nodeUnblacklistedJsonString) testEvent(executorMetricsUpdate, executorMetricsUpdateJsonString) + testEvent(blockUpdated, blockUpdatedJsonString) } test("Dependent Classes") { @@ -2007,6 +2011,29 @@ private[spark] object JsonProtocolSuite extends Assertions { |} """.stripMargin + private val blockUpdatedJsonString = + """ + |{ + | "Event": "SparkListenerBlockUpdated", + | "Block Updated Info": { + | "Block Manager ID": { + | "Executor ID": "Stars", + | "Host": "In your multitude...", + | "Port": 300 + | }, + | "Block ID": "rdd_0_0", + | "Storage Level": { + | "Use Disk": false, + | "Use Memory": true, + | "Deserialized": true, + | "Replication": 1 + | }, + | "Memory Size": 100, + | "Disk Size": 0 + | } + |} + """.stripMargin + private val executorBlacklistedJsonString = s""" |{ diff --git a/docs/configuration.md b/docs/configuration.md index bb06c8faaaed7..7b9e16a382449 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -714,6 +714,14 @@ Apart from these, the following properties are also available, and may be useful + + + + + From f3137feecd30c74c47dbddb0e22b4ddf8cf2f912 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 17 Oct 2017 20:09:12 -0700 Subject: [PATCH 727/779] [SPARK-22278][SS] Expose current event time watermark and current processing time in GroupState ## What changes were proposed in this pull request? Complex state-updating and/or timeout-handling logic in mapGroupsWithState functions may require taking decisions based on the current event-time watermark and/or processing time. Currently, you can use the SQL function `current_timestamp` to get the current processing time, but it needs to be passed inserted in every row with a select, and then passed through the encoder, which isn't efficient. Furthermore, there is no way to get the current watermark. This PR exposes both of them through the GroupState API. Additionally, it also cleans up some of the GroupState docs. ## How was this patch tested? New unit tests Author: Tathagata Das Closes #19495 from tdas/SPARK-22278. 
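A minimal sketch of the new accessors inside a `mapGroupsWithState` function follows; the `Event` and `UserSummary` types and the timeout policy are illustrative assumptions, and `getCurrentWatermarkMs()` requires that a watermark has been defined on the input stream:

```scala
import java.sql.Timestamp
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

// Illustrative types, not part of the patch.
case class Event(user: String, eventTime: Timestamp)
case class UserSummary(user: String, count: Long, watermarkMs: Long, processingTimeMs: Long)

def updateSummary(
    user: String,
    events: Iterator[Event],
    state: GroupState[UserSummary]): UserSummary = {
  // New in this patch: the current event-time watermark and the current batch
  // processing time are available directly on GroupState, with no need to route
  // current_timestamp() through every row and the encoder.
  val watermarkMs = state.getCurrentWatermarkMs()
  val processingTimeMs = state.getCurrentProcessingTimeMs()

  val previousCount = state.getOption.map(_.count).getOrElse(0L)
  val updated = UserSummary(user, previousCount + events.size, watermarkMs, processingTimeMs)
  state.update(updated)
  // Illustrative timeout policy: expire the group one minute of event time past the
  // watermark observed at this update.
  state.setTimeoutTimestamp(watermarkMs + 60 * 1000)
  updated
}

// Assumed wiring: `events` is a streaming Dataset[Event] and spark.implicits._ is in scope.
// val summaries = events
//   .withWatermark("eventTime", "10 minutes")
//   .groupByKey(_.user)
//   .mapGroupsWithState(GroupStateTimeout.EventTimeTimeout)(updateSummary _)
```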
--- .../apache/spark/sql/execution/objects.scala | 8 +- .../FlatMapGroupsWithStateExec.scala | 7 +- .../execution/streaming/GroupStateImpl.scala | 50 +++--- .../spark/sql/streaming/GroupState.scala | 92 ++++++---- .../FlatMapGroupsWithStateSuite.scala | 160 +++++++++++++++--- 5 files changed, 238 insertions(+), 79 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index c68975bea490f..d861109436a08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.objects.Invoke -import org.apache.spark.sql.catalyst.plans.logical.{FunctionUtils, LogicalGroupState} +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, FunctionUtils, LogicalGroupState} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.streaming.GroupStateImpl import org.apache.spark.sql.streaming.GroupStateTimeout @@ -361,8 +361,12 @@ object MapGroupsExec { outputObjAttr: Attribute, timeoutConf: GroupStateTimeout, child: SparkPlan): MapGroupsExec = { + val watermarkPresent = child.output.exists { + case a: Attribute if a.metadata.contains(EventTimeWatermark.delayKey) => true + case _ => false + } val f = (key: Any, values: Iterator[Any]) => { - func(key, values, GroupStateImpl.createForBatch(timeoutConf)) + func(key, values, GroupStateImpl.createForBatch(timeoutConf, watermarkPresent)) } new MapGroupsExec(f, keyDeserializer, valueDeserializer, groupingAttributes, dataAttributes, outputObjAttr, child) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index c81f1a8142784..29f38fab3f896 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -61,6 +61,10 @@ case class FlatMapGroupsWithStateExec( private val isTimeoutEnabled = timeoutConf != NoTimeout val stateManager = new FlatMapGroupsWithState_StateManager(stateEncoder, isTimeoutEnabled) + val watermarkPresent = child.output.exists { + case a: Attribute if a.metadata.contains(EventTimeWatermark.delayKey) => true + case _ => false + } /** Distribute by grouping attributes */ override def requiredChildDistribution: Seq[Distribution] = @@ -190,7 +194,8 @@ case class FlatMapGroupsWithStateExec( batchTimestampMs.getOrElse(NO_TIMESTAMP), eventTimeWatermark.getOrElse(NO_TIMESTAMP), timeoutConf, - hasTimedOut) + hasTimedOut, + watermarkPresent) // Call function, get the returned objects and convert them to rows val mappedIterator = func(keyObj, valueObjIter, groupState).map { obj => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala index 4401e86936af9..7f65e3ea9dd5e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala @@ -43,7 +43,8 @@ private[sql] class GroupStateImpl[S] private( batchProcessingTimeMs: Long, eventTimeWatermarkMs: Long, timeoutConf: GroupStateTimeout, - override val hasTimedOut: Boolean) extends GroupState[S] { + override val hasTimedOut: Boolean, + watermarkPresent: Boolean) extends GroupState[S] { private var value: S = optionalValue.getOrElse(null.asInstanceOf[S]) private var defined: Boolean = optionalValue.isDefined @@ -90,7 +91,7 @@ private[sql] class GroupStateImpl[S] private( if (timeoutConf != ProcessingTimeTimeout) { throw new UnsupportedOperationException( "Cannot set timeout duration without enabling processing time timeout in " + - "map/flatMapGroupsWithState") + "[map|flatMap]GroupsWithState") } if (durationMs <= 0) { throw new IllegalArgumentException("Timeout duration must be positive") @@ -102,10 +103,6 @@ private[sql] class GroupStateImpl[S] private( setTimeoutDuration(parseDuration(duration)) } - @throws[IllegalArgumentException]("if 'timestampMs' is not positive") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") override def setTimeoutTimestamp(timestampMs: Long): Unit = { checkTimeoutTimestampAllowed() if (timestampMs <= 0) { @@ -119,32 +116,34 @@ private[sql] class GroupStateImpl[S] private( timeoutTimestamp = timestampMs } - @throws[IllegalArgumentException]("if 'additionalDuration' is invalid") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") override def setTimeoutTimestamp(timestampMs: Long, additionalDuration: String): Unit = { checkTimeoutTimestampAllowed() setTimeoutTimestamp(parseDuration(additionalDuration) + timestampMs) } - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") override def setTimeoutTimestamp(timestamp: Date): Unit = { checkTimeoutTimestampAllowed() setTimeoutTimestamp(timestamp.getTime) } - @throws[IllegalArgumentException]("if 'additionalDuration' is invalid") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") override def setTimeoutTimestamp(timestamp: Date, additionalDuration: String): Unit = { checkTimeoutTimestampAllowed() setTimeoutTimestamp(timestamp.getTime + parseDuration(additionalDuration)) } + override def getCurrentWatermarkMs(): Long = { + if (!watermarkPresent) { + throw new UnsupportedOperationException( + "Cannot get event time watermark timestamp without setting watermark before " + + "[map|flatMap]GroupsWithState") + } + eventTimeWatermarkMs + } + + override def getCurrentProcessingTimeMs(): Long = { + batchProcessingTimeMs + } + override def toString: String = { s"GroupState(${getOption.map(_.toString).getOrElse("")})" } @@ -187,7 +186,7 @@ private[sql] class GroupStateImpl[S] private( if (timeoutConf != EventTimeTimeout) { throw new UnsupportedOperationException( "Cannot set timeout timestamp without enabling event time timeout in " + - 
"map/flatMapGroupsWithState") + "[map|flatMapGroupsWithState") } } } @@ -202,17 +201,22 @@ private[sql] object GroupStateImpl { batchProcessingTimeMs: Long, eventTimeWatermarkMs: Long, timeoutConf: GroupStateTimeout, - hasTimedOut: Boolean): GroupStateImpl[S] = { + hasTimedOut: Boolean, + watermarkPresent: Boolean): GroupStateImpl[S] = { new GroupStateImpl[S]( - optionalValue, batchProcessingTimeMs, eventTimeWatermarkMs, timeoutConf, hasTimedOut) + optionalValue, batchProcessingTimeMs, eventTimeWatermarkMs, + timeoutConf, hasTimedOut, watermarkPresent) } - def createForBatch(timeoutConf: GroupStateTimeout): GroupStateImpl[Any] = { + def createForBatch( + timeoutConf: GroupStateTimeout, + watermarkPresent: Boolean): GroupStateImpl[Any] = { new GroupStateImpl[Any]( optionalValue = None, - batchProcessingTimeMs = NO_TIMESTAMP, + batchProcessingTimeMs = System.currentTimeMillis, eventTimeWatermarkMs = NO_TIMESTAMP, timeoutConf, - hasTimedOut = false) + hasTimedOut = false, + watermarkPresent) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala index 04a956b70b022..e9510c903acae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala @@ -205,11 +205,7 @@ trait GroupState[S] extends LogicalGroupState[S] { /** Get the state value as a scala Option. */ def getOption: Option[S] - /** - * Update the value of the state. Note that `null` is not a valid value, and it throws - * IllegalArgumentException. - */ - @throws[IllegalArgumentException]("when updating with null") + /** Update the value of the state. */ def update(newState: S): Unit /** Remove this state. */ @@ -217,80 +213,114 @@ trait GroupState[S] extends LogicalGroupState[S] { /** * Whether the function has been called because the key has timed out. - * @note This can return true only when timeouts are enabled in `[map/flatmap]GroupsWithStates`. + * @note This can return true only when timeouts are enabled in `[map/flatMap]GroupsWithState`. */ def hasTimedOut: Boolean + /** * Set the timeout duration in ms for this key. * - * @note ProcessingTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Processing time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no effect when used in a batch query. */ @throws[IllegalArgumentException]("if 'durationMs' is not positive") - @throws[IllegalStateException]("when state is either not initialized, or already removed") @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + "if processing time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutDuration(durationMs: Long): Unit + /** * Set the timeout duration for this key as a string. For example, "1 hour", "2 days", etc. * - * @note ProcessingTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Processing time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no effect when used in a batch query. 
*/ @throws[IllegalArgumentException]("if 'duration' is not a valid duration") - @throws[IllegalStateException]("when state is either not initialized, or already removed") @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + "if processing time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutDuration(duration: String): Unit - @throws[IllegalArgumentException]("if 'timestampMs' is not positive") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + /** * Set the timeout timestamp for this key as milliseconds in epoch time. * This timestamp cannot be older than the current watermark. * - * @note EventTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Event time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no effect when used in a batch query. */ + @throws[IllegalArgumentException]( + "if 'timestampMs' is not positive or less than the current watermark in a streaming query") + @throws[UnsupportedOperationException]( + "if processing time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutTimestamp(timestampMs: Long): Unit - @throws[IllegalArgumentException]("if 'additionalDuration' is invalid") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + /** * Set the timeout timestamp for this key as milliseconds in epoch time and an additional * duration as a string (e.g. "1 hour", "2 days", etc.). * The final timestamp (including the additional duration) cannot be older than the * current watermark. * - * @note EventTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Event time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no side effect when used in a batch query. */ + @throws[IllegalArgumentException]( + "if 'additionalDuration' is invalid or the final timeout timestamp is less than " + + "the current watermark in a streaming query") + @throws[UnsupportedOperationException]( + "if event time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutTimestamp(timestampMs: Long, additionalDuration: String): Unit - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + /** * Set the timeout timestamp for this key as a java.sql.Date. * This timestamp cannot be older than the current watermark. * - * @note EventTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Event time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no side effect when used in a batch query. 
*/ + @throws[UnsupportedOperationException]( + "if event time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutTimestamp(timestamp: java.sql.Date): Unit - @throws[IllegalArgumentException]("if 'additionalDuration' is invalid") - @throws[IllegalStateException]("when state is either not initialized, or already removed") - @throws[UnsupportedOperationException]( - "if 'timeout' has not been enabled in [map|flatMap]GroupsWithState in a streaming query") + /** * Set the timeout timestamp for this key as a java.sql.Date and an additional * duration as a string (e.g. "1 hour", "2 days", etc.). * The final timestamp (including the additional duration) cannot be older than the * current watermark. * - * @note EventTimeTimeout must be enabled in `[map/flatmap]GroupsWithStates`. + * @note [[GroupStateTimeout Event time timeout]] must be enabled in + * `[map/flatMap]GroupsWithState` for calling this method. + * @note This method has no side effect when used in a batch query. */ + @throws[IllegalArgumentException]("if 'additionalDuration' is invalid") + @throws[UnsupportedOperationException]( + "if event time timeout has not been enabled in [map|flatMap]GroupsWithState") def setTimeoutTimestamp(timestamp: java.sql.Date, additionalDuration: String): Unit + + + /** + * Get the current event time watermark as milliseconds in epoch time. + * + * @note In a streaming query, this can be called only when watermark is set before calling + * `[map/flatMap]GroupsWithState`. In a batch query, this method always returns -1. + */ + @throws[UnsupportedOperationException]( + "if watermark has not been set before in [map|flatMap]GroupsWithState") + def getCurrentWatermarkMs(): Long + + + /** + * Get the current processing time as milliseconds in epoch time. + * @note In a streaming query, this will return a constant value throughout the duration of a + * trigger, even if the trigger is re-executed. 
+ */ + def getCurrentProcessingTimeMs(): Long } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index aeb83835f981a..af08186aadbb0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -21,6 +21,7 @@ import java.sql.Date import java.util.concurrent.ConcurrentHashMap import org.scalatest.BeforeAndAfterAll +import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkException import org.apache.spark.api.java.function.FlatMapGroupsWithStateFunction @@ -48,6 +49,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest import testImplicits._ import GroupStateImpl._ import GroupStateTimeout._ + import FlatMapGroupsWithStateSuite._ override def afterAll(): Unit = { super.afterAll() @@ -77,13 +79,15 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest // === Tests for state in streaming queries === // Updating empty state - state = GroupStateImpl.createForStreaming(None, 1, 1, NoTimeout, hasTimedOut = false) + state = GroupStateImpl.createForStreaming( + None, 1, 1, NoTimeout, hasTimedOut = false, watermarkPresent = false) testState(None) state.update("") testState(Some(""), shouldBeUpdated = true) // Updating exiting state - state = GroupStateImpl.createForStreaming(Some("2"), 1, 1, NoTimeout, hasTimedOut = false) + state = GroupStateImpl.createForStreaming( + Some("2"), 1, 1, NoTimeout, hasTimedOut = false, watermarkPresent = false) testState(Some("2")) state.update("3") testState(Some("3"), shouldBeUpdated = true) @@ -104,8 +108,9 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest test("GroupState - setTimeout - with NoTimeout") { for (initValue <- Seq(None, Some(5))) { val states = Seq( - GroupStateImpl.createForStreaming(initValue, 1000, 1000, NoTimeout, hasTimedOut = false), - GroupStateImpl.createForBatch(NoTimeout) + GroupStateImpl.createForStreaming( + initValue, 1000, 1000, NoTimeout, hasTimedOut = false, watermarkPresent = false), + GroupStateImpl.createForBatch(NoTimeout, watermarkPresent = false) ) for (state <- states) { // for streaming queries @@ -122,7 +127,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest test("GroupState - setTimeout - with ProcessingTimeTimeout") { // for streaming queries var state: GroupStateImpl[Int] = GroupStateImpl.createForStreaming( - None, 1000, 1000, ProcessingTimeTimeout, hasTimedOut = false) + None, 1000, 1000, ProcessingTimeTimeout, hasTimedOut = false, watermarkPresent = false) assert(state.getTimeoutTimestamp === NO_TIMESTAMP) state.setTimeoutDuration(500) assert(state.getTimeoutTimestamp === 1500) // can be set without initializing state @@ -143,7 +148,8 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest testTimeoutTimestampNotAllowed[UnsupportedOperationException](state) // for batch queries - state = GroupStateImpl.createForBatch(ProcessingTimeTimeout).asInstanceOf[GroupStateImpl[Int]] + state = GroupStateImpl.createForBatch( + ProcessingTimeTimeout, watermarkPresent = false).asInstanceOf[GroupStateImpl[Int]] assert(state.getTimeoutTimestamp === NO_TIMESTAMP) state.setTimeoutDuration(500) testTimeoutTimestampNotAllowed[UnsupportedOperationException](state) @@ -160,7 +166,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest test("GroupState - 
setTimeout - with EventTimeTimeout") { var state: GroupStateImpl[Int] = GroupStateImpl.createForStreaming( - None, 1000, 1000, EventTimeTimeout, false) + None, 1000, 1000, EventTimeTimeout, false, watermarkPresent = true) assert(state.getTimeoutTimestamp === NO_TIMESTAMP) testTimeoutDurationNotAllowed[UnsupportedOperationException](state) @@ -182,7 +188,8 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest testTimeoutDurationNotAllowed[UnsupportedOperationException](state) // for batch queries - state = GroupStateImpl.createForBatch(EventTimeTimeout).asInstanceOf[GroupStateImpl[Int]] + state = GroupStateImpl.createForBatch(EventTimeTimeout, watermarkPresent = false) + .asInstanceOf[GroupStateImpl[Int]] assert(state.getTimeoutTimestamp === NO_TIMESTAMP) testTimeoutDurationNotAllowed[UnsupportedOperationException](state) state.setTimeoutTimestamp(5000) @@ -209,7 +216,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest } state = GroupStateImpl.createForStreaming( - Some(5), 1000, 1000, ProcessingTimeTimeout, hasTimedOut = false) + Some(5), 1000, 1000, ProcessingTimeTimeout, hasTimedOut = false, watermarkPresent = false) testIllegalTimeout { state.setTimeoutDuration(-1000) } @@ -227,7 +234,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest } state = GroupStateImpl.createForStreaming( - Some(5), 1000, 1000, EventTimeTimeout, hasTimedOut = false) + Some(5), 1000, 1000, EventTimeTimeout, hasTimedOut = false, watermarkPresent = false) testIllegalTimeout { state.setTimeoutTimestamp(-10000) } @@ -259,29 +266,92 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest // for streaming queries for (initState <- Seq(None, Some(5))) { val state1 = GroupStateImpl.createForStreaming( - initState, 1000, 1000, timeoutConf, hasTimedOut = false) + initState, 1000, 1000, timeoutConf, hasTimedOut = false, watermarkPresent = false) assert(state1.hasTimedOut === false) val state2 = GroupStateImpl.createForStreaming( - initState, 1000, 1000, timeoutConf, hasTimedOut = true) + initState, 1000, 1000, timeoutConf, hasTimedOut = true, watermarkPresent = false) assert(state2.hasTimedOut === true) } // for batch queries - assert(GroupStateImpl.createForBatch(timeoutConf).hasTimedOut === false) + assert( + GroupStateImpl.createForBatch(timeoutConf, watermarkPresent = false).hasTimedOut === false) + } + } + + test("GroupState - getCurrentWatermarkMs") { + def streamingState(timeoutConf: GroupStateTimeout, watermark: Option[Long]): GroupState[Int] = { + GroupStateImpl.createForStreaming( + None, 1000, watermark.getOrElse(-1), timeoutConf, + hasTimedOut = false, watermark.nonEmpty) + } + + def batchState(timeoutConf: GroupStateTimeout, watermarkPresent: Boolean): GroupState[Any] = { + GroupStateImpl.createForBatch(timeoutConf, watermarkPresent) + } + + def assertWrongTimeoutError(test: => Unit): Unit = { + val e = intercept[UnsupportedOperationException] { test } + assert(e.getMessage.contains( + "Cannot get event time watermark timestamp without setting watermark")) + } + + for (timeoutConf <- Seq(NoTimeout, EventTimeTimeout, ProcessingTimeTimeout)) { + // Tests for getCurrentWatermarkMs in streaming queries + assertWrongTimeoutError { streamingState(timeoutConf, None).getCurrentWatermarkMs() } + assert(streamingState(timeoutConf, Some(1000)).getCurrentWatermarkMs() === 1000) + assert(streamingState(timeoutConf, Some(2000)).getCurrentWatermarkMs() === 2000) + + // Tests for getCurrentWatermarkMs in batch queries + assertWrongTimeoutError { + 
batchState(timeoutConf, watermarkPresent = false).getCurrentWatermarkMs() + } + assert(batchState(timeoutConf, watermarkPresent = true).getCurrentWatermarkMs() === -1) + } + } + + test("GroupState - getCurrentProcessingTimeMs") { + def streamingState( + timeoutConf: GroupStateTimeout, + procTime: Long, + watermarkPresent: Boolean): GroupState[Int] = { + GroupStateImpl.createForStreaming( + None, procTime, -1, timeoutConf, hasTimedOut = false, watermarkPresent = false) + } + + def batchState(timeoutConf: GroupStateTimeout, watermarkPresent: Boolean): GroupState[Any] = { + GroupStateImpl.createForBatch(timeoutConf, watermarkPresent) + } + + for (timeoutConf <- Seq(NoTimeout, EventTimeTimeout, ProcessingTimeTimeout)) { + for (watermarkPresent <- Seq(false, true)) { + // Tests for getCurrentProcessingTimeMs in streaming queries + assert(streamingState(timeoutConf, NO_TIMESTAMP, watermarkPresent) + .getCurrentProcessingTimeMs() === -1) + assert(streamingState(timeoutConf, 1000, watermarkPresent) + .getCurrentProcessingTimeMs() === 1000) + assert(streamingState(timeoutConf, 2000, watermarkPresent) + .getCurrentProcessingTimeMs() === 2000) + + // Tests for getCurrentProcessingTimeMs in batch queries + val currentTime = System.currentTimeMillis() + assert(batchState(timeoutConf, watermarkPresent).getCurrentProcessingTimeMs >= currentTime) + } } } + test("GroupState - primitive type") { var intState = GroupStateImpl.createForStreaming[Int]( - None, 1000, 1000, NoTimeout, hasTimedOut = false) + None, 1000, 1000, NoTimeout, hasTimedOut = false, watermarkPresent = false) intercept[NoSuchElementException] { intState.get } assert(intState.getOption === None) intState = GroupStateImpl.createForStreaming[Int]( - Some(10), 1000, 1000, NoTimeout, hasTimedOut = false) + Some(10), 1000, 1000, NoTimeout, hasTimedOut = false, watermarkPresent = false) assert(intState.get == 10) intState.update(0) assert(intState.get == 0) @@ -304,7 +374,11 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest testStateUpdateWithData( testName + "no update", - stateUpdates = state => { /* do nothing */ }, + stateUpdates = state => { + assert(state.getCurrentProcessingTimeMs() === currentBatchTimestamp) + intercept[Exception] { state.getCurrentWatermarkMs() } // watermark not specified + /* no updates */ + }, timeoutConf = GroupStateTimeout.NoTimeout, priorState = priorState, expectedState = priorState) // should not change @@ -342,7 +416,11 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest testStateUpdateWithData( s"$timeoutConf - $testName - no update", - stateUpdates = state => { /* do nothing */ }, + stateUpdates = state => { + assert(state.getCurrentProcessingTimeMs() === currentBatchTimestamp) + intercept[Exception] { state.getCurrentWatermarkMs() } // watermark not specified + /* no updates */ + }, timeoutConf = timeoutConf, priorState = priorState, priorTimeoutTimestamp = priorTimeoutTimestamp, @@ -466,7 +544,11 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest testStateUpdateWithTimeout( s"$timeoutConf - should timeout - no update/remove", - stateUpdates = state => { /* do nothing */ }, + stateUpdates = state => { + assert(state.getCurrentProcessingTimeMs() === currentBatchTimestamp) + intercept[Exception] { state.getCurrentWatermarkMs() } // watermark not specified + /* no updates */ + }, timeoutConf = timeoutConf, priorTimeoutTimestamp = beforeTimeoutThreshold, expectedState = preTimeoutState, // state should not change @@ -525,6 +607,8 @@ class FlatMapGroupsWithStateSuite 
extends StateStoreMetricsTest // Function to maintain running count up to 2, and then remove the count // Returns the data and the count if state is defined, otherwise does not return anything val stateFunc = (key: String, values: Iterator[String], state: GroupState[RunningCount]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() >= 0 } + assertCannotGetWatermark { state.getCurrentWatermarkMs() } val count = state.getOption.map(_.count).getOrElse(0L) + values.size if (count == 3) { @@ -647,6 +731,9 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest test("flatMapGroupsWithState - batch") { // Function that returns running count only if its even, otherwise does not return val stateFunc = (key: String, values: Iterator[String], state: GroupState[RunningCount]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() > 0 } + assertCannotGetWatermark { state.getCurrentWatermarkMs() } + if (state.exists) throw new IllegalArgumentException("state.exists should be false") Iterator((key, values.size)) } @@ -660,6 +747,9 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest // Function to maintain running count up to 2, and then remove the count // Returns the data and the count (-1 if count reached beyond 2 and state was just removed) val stateFunc = (key: String, values: Iterator[String], state: GroupState[RunningCount]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() >= 0 } + assertCannotGetWatermark { state.getCurrentWatermarkMs() } + if (state.hasTimedOut) { state.remove() Iterator((key, "-1")) @@ -713,10 +803,10 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest test("flatMapGroupsWithState - streaming with event time timeout + watermark") { // Function to maintain the max event time // Returns the max event time in the state, or -1 if the state was removed by timeout - val stateFunc = ( - key: String, - values: Iterator[(String, Long)], - state: GroupState[Long]) => { + val stateFunc = (key: String, values: Iterator[(String, Long)], state: GroupState[Long]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() >= 0 } + assertCanGetWatermark { state.getCurrentWatermarkMs() >= -1 } + val timeoutDelay = 5 if (key != "a") { Iterator.empty @@ -760,6 +850,8 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest // Function to maintain running count up to 2, and then remove the count // Returns the data and the count (-1 if count reached beyond 2 and state was just removed) val stateFunc = (key: String, values: Iterator[String], state: GroupState[RunningCount]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() >= 0 } + assertCannotGetWatermark { state.getCurrentWatermarkMs() } val count = state.getOption.map(_.count).getOrElse(0L) + values.size if (count == 3) { @@ -802,7 +894,11 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest // - no initial state // - timeouts operations work, does not throw any error [SPARK-20792] // - works with primitive state type + // - can get processing time val stateFunc = (key: String, values: Iterator[String], state: GroupState[Int]) => { + assertCanGetProcessingTime { state.getCurrentProcessingTimeMs() > 0 } + assertCannotGetWatermark { state.getCurrentWatermarkMs() } + if (state.exists) throw new IllegalArgumentException("state.exists should be false") state.setTimeoutTimestamp(0, "1 hour") state.update(10) @@ -1090,4 +1186,24 @@ object FlatMapGroupsWithStateSuite { override def metrics: 
StateStoreMetrics = new StateStoreMetrics(map.size, 0, Map.empty) override def hasCommitted: Boolean = true } + + def assertCanGetProcessingTime(predicate: => Boolean): Unit = { + if (!predicate) throw new TestFailedException("Could not get processing time", 20) + } + + def assertCanGetWatermark(predicate: => Boolean): Unit = { + if (!predicate) throw new TestFailedException("Could not get processing time", 20) + } + + def assertCannotGetWatermark(func: => Unit): Unit = { + try { + func + } catch { + case u: UnsupportedOperationException => + return + case _ => + throw new TestFailedException("Unexpected exception when trying to get watermark", 20) + } + throw new TestFailedException("Could get watermark when not expected", 20) + } } From 72561ecf4b611d68f8bf695ddd0c4c2cce3a29d9 Mon Sep 17 00:00:00 2001 From: maryannxue Date: Wed, 18 Oct 2017 20:59:40 +0800 Subject: [PATCH 728/779] [SPARK-22266][SQL] The same aggregate function was evaluated multiple times ## What changes were proposed in this pull request? To let the same aggregate function that appears multiple times in an Aggregate be evaluated only once, we need to deduplicate the aggregate expressions. The original code was trying to use a "distinct" call to get a set of aggregate expressions, but this did not work, since the "distinct" did not compare semantic equality. And even if it had, further work would still be needed in result expression rewriting. In this PR, I changed the "set" to a map mapping the semantic identity of an aggregate expression to itself. Thus, later on, when rewriting result expressions (i.e., output expressions), the aggregate expression reference can be fixed. ## How was this patch tested? Added a new test in SQLQuerySuite Author: maryannxue Closes #19488 from maryannxue/spark-22266. --- .../sql/catalyst/planning/patterns.scala | 16 +++++++----- .../org/apache/spark/sql/SQLQuerySuite.scala | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index 8d034c21a4960..cc391aae55787 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -205,14 +205,17 @@ object PhysicalAggregation { case logical.Aggregate(groupingExpressions, resultExpressions, child) => // A single aggregate expression might appear multiple times in resultExpressions. // In order to avoid evaluating an individual aggregate function multiple times, we'll - // build a set of the distinct aggregate expressions and build a function which can - // be used to re-write expressions so that they reference the single copy of the - // aggregate function which actually gets computed. + // build a set of semantically distinct aggregate expressions and re-write expressions so + // that they reference the single copy of the aggregate function which actually gets computed. + // Non-deterministic aggregate expressions are not deduplicated. + val equivalentAggregateExpressions = new EquivalentExpressions val aggregateExpressions = resultExpressions.flatMap { expr => expr.collect { // addExpr() always returns false for non-deterministic expressions and do not add them.
+ case agg: AggregateExpression + if (!equivalentAggregateExpressions.addExpr(agg)) => agg } - }.distinct + } val namedGroupingExpressions = groupingExpressions.map { case ne: NamedExpression => ne -> ne @@ -236,7 +239,8 @@ object PhysicalAggregation { case ae: AggregateExpression => // The final aggregation buffer's attributes will be `finalAggregationAttributes`, // so replace each aggregate expression by its corresponding attribute in the set: - ae.resultAttribute + equivalentAggregateExpressions.getEquivalentExprs(ae).headOption + .getOrElse(ae).asInstanceOf[AggregateExpression].resultAttribute case expression => // Since we're using `namedGroupingAttributes` to extract the grouping key // columns, we need to replace grouping key expressions with their corresponding diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index f0c58e2e5bf45..caf332d050d7b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.{AccumulatorSuite, SparkException} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate +import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -2715,4 +2716,29 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { checkAnswer(df, Row(1, 1, 1)) } } + + test("SRARK-22266: the same aggregate function was calculated multiple times") { + val query = "SELECT a, max(b+1), max(b+1) + 1 FROM testData2 GROUP BY a" + val df = sql(query) + val physical = df.queryExecution.sparkPlan + val aggregateExpressions = physical.collectFirst { + case agg : HashAggregateExec => agg.aggregateExpressions + case agg : SortAggregateExec => agg.aggregateExpressions + } + assert (aggregateExpressions.isDefined) + assert (aggregateExpressions.get.size == 1) + checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil) + } + + test("Non-deterministic aggregate functions should not be deduplicated") { + val query = "SELECT a, first_value(b), first_value(b) + 1 FROM testData2 GROUP BY a" + val df = sql(query) + val physical = df.queryExecution.sparkPlan + val aggregateExpressions = physical.collectFirst { + case agg : HashAggregateExec => agg.aggregateExpressions + case agg : SortAggregateExec => agg.aggregateExpressions + } + assert (aggregateExpressions.isDefined) + assert (aggregateExpressions.get.size == 2) + } } From 1f25d8683a84a479fd7fc77b5a1ea980289b681b Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 18 Oct 2017 09:14:46 -0700 Subject: [PATCH 729/779] [SPARK-22249][FOLLOWUP][SQL] Check if list of value for IN is empty in the optimizer ## What changes were proposed in this pull request? This PR addresses the comments by gatorsmile on [the previous PR](https://github.com/apache/spark/pull/19494). ## How was this patch tested? Previous UT and added UT. Author: Marco Gaido Closes #19522 from mgaido91/SPARK-22249_FOLLOWUP. 
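For context, a hedged, illustrative example of the situation this guard protects (the DataFrame and the `spark` session below are assumptions, not part of the PR): filtering a cached relation with an empty IN list should simply match no rows rather than fail while the stats-based batch filter is built.

```scala
// Assumes an active SparkSession named `spark`.
import spark.implicits._

val cached = spark.range(10).toDF("id").cache()
val noValues = Seq.empty[Int]
// Empty IN list: with this change the cached-batch predicate simply never matches,
// so the query returns 0 rows instead of throwing while building the filter.
cached.filter($"id".isin(noValues: _*)).count()
```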
--- .../execution/columnar/InMemoryTableScanExec.scala | 4 ++-- .../columnar/InMemoryColumnarQuerySuite.scala | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 846ec03e46a12..139da1c519da2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -102,8 +102,8 @@ case class InMemoryTableScanExec( case IsNull(a: Attribute) => statsFor(a).nullCount > 0 case IsNotNull(a: Attribute) => statsFor(a).count - statsFor(a).nullCount > 0 - case In(_: AttributeReference, list: Seq[Expression]) if list.isEmpty => Literal.FalseLiteral - case In(a: AttributeReference, list: Seq[Expression]) if list.forall(_.isInstanceOf[Literal]) => + case In(a: AttributeReference, list: Seq[Expression]) + if list.forall(_.isInstanceOf[Literal]) && list.nonEmpty => list.map(l => statsFor(a).lowerBound <= l.asInstanceOf[Literal] && l.asInstanceOf[Literal] <= statsFor(a).upperBound).reduce(_ || _) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 75d17bc79477d..2f249c850a088 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -21,8 +21,9 @@ import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import org.apache.spark.sql.{DataFrame, QueryTest, Row} -import org.apache.spark.sql.catalyst.expressions.AttributeSet +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, In} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning +import org.apache.spark.sql.execution.LocalTableScanExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -444,4 +445,13 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { assert(dfNulls.filter($"id".isin(2, 3)).count() == 0) dfNulls.unpersist() } + + test("SPARK-22249: buildFilter should not throw exception when In contains an empty list") { + val attribute = AttributeReference("a", IntegerType)() + val testRelation = InMemoryRelation(false, 1, MEMORY_ONLY, + LocalTableScanExec(Seq(attribute), Nil), None) + val tableScanExec = InMemoryTableScanExec(Seq(attribute), + Seq(In(attribute, Nil)), testRelation) + assert(tableScanExec.partitionFilters.isEmpty) + } } From 52facb0062a4253fa45ac0c633d0510a9b684a62 Mon Sep 17 00:00:00 2001 From: Valeriy Avanesov Date: Wed, 18 Oct 2017 10:46:46 -0700 Subject: [PATCH 730/779] [SPARK-14371][MLLIB] OnlineLDAOptimizer should not collect stats for each doc in mini-batch to driver Hi, # What changes were proposed in this pull request? As proposed by jkbradley, ```gammat``` is no longer collected to the driver. # How was this patch tested? Existing test suite. Author: Valeriy Avanesov Author: Valeriy Avanesov Closes #18924 from akopich/master.
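The heart of the change below is replacing a collect-to-the-driver step with per-partition accumulation combined via `treeAggregate`. As a hedged, much-simplified sketch of that pattern (the names, types, and statistics are illustrative, not the `OnlineLDAOptimizer` code):

```scala
import org.apache.spark.rdd.RDD

// Sum k-dimensional per-document statistics and count documents without ever
// materializing the per-document vectors on the driver.
def aggregateStats(docStats: RDD[Array[Double]], k: Int): (Array[Double], Long) = {
  val zero = (new Array[Double](k), 0L)
  docStats.treeAggregate(zero)(
    seqOp = { case ((acc, n), stat) =>
      var i = 0
      while (i < k) { acc(i) += stat(i); i += 1 }
      (acc, n + 1L)
    },
    combOp = { case ((a, n1), (b, n2)) =>
      var i = 0
      while (i < k) { a(i) += b(i); i += 1 }
      (a, n1 + n2)
    })
}
```

The actual patch aggregates a dense matrix plus an optional `logphat` vector and a non-empty-document count in the same pass, but the driver-side memory behavior follows the same idea.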
--- .../spark/mllib/clustering/LDAOptimizer.scala | 82 +++++++++++++------ 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index d633893e55f55..693a2a31f026b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -26,6 +26,7 @@ import breeze.stats.distributions.{Gamma, RandBasis} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.graphx._ import org.apache.spark.graphx.util.PeriodicGraphCheckpointer +import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -259,7 +260,7 @@ final class EMLDAOptimizer extends LDAOptimizer { */ @Since("1.4.0") @DeveloperApi -final class OnlineLDAOptimizer extends LDAOptimizer { +final class OnlineLDAOptimizer extends LDAOptimizer with Logging { // LDA common parameters private var k: Int = 0 @@ -462,31 +463,61 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val expElogbetaBc = batch.sparkContext.broadcast(expElogbeta) val alpha = this.alpha.asBreeze val gammaShape = this.gammaShape - - val stats: RDD[(BDM[Double], List[BDV[Double]])] = batch.mapPartitions { docs => + val optimizeDocConcentration = this.optimizeDocConcentration + // If and only if optimizeDocConcentration is set true, + // we calculate logphat in the same pass as other statistics. + // No calculation of loghat happens otherwise. + val logphatPartOptionBase = () => if (optimizeDocConcentration) { + Some(BDV.zeros[Double](k)) + } else { + None + } + + val stats: RDD[(BDM[Double], Option[BDV[Double]], Long)] = batch.mapPartitions { docs => val nonEmptyDocs = docs.filter(_._2.numNonzeros > 0) val stat = BDM.zeros[Double](k, vocabSize) - var gammaPart = List[BDV[Double]]() + val logphatPartOption = logphatPartOptionBase() + var nonEmptyDocCount: Long = 0L nonEmptyDocs.foreach { case (_, termCounts: Vector) => + nonEmptyDocCount += 1 val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, alpha, gammaShape, k) - stat(::, ids) := stat(::, ids).toDenseMatrix + sstats - gammaPart = gammad :: gammaPart + stat(::, ids) := stat(::, ids) + sstats + logphatPartOption.foreach(_ += LDAUtils.dirichletExpectation(gammad)) } - Iterator((stat, gammaPart)) - }.persist(StorageLevel.MEMORY_AND_DISK) - val statsSum: BDM[Double] = stats.map(_._1).treeAggregate(BDM.zeros[Double](k, vocabSize))( - _ += _, _ += _) - val gammat: BDM[Double] = breeze.linalg.DenseMatrix.vertcat( - stats.map(_._2).flatMap(list => list).collect().map(_.toDenseMatrix): _*) - stats.unpersist() + Iterator((stat, logphatPartOption, nonEmptyDocCount)) + } + + val elementWiseSum = ( + u: (BDM[Double], Option[BDV[Double]], Long), + v: (BDM[Double], Option[BDV[Double]], Long)) => { + u._1 += v._1 + u._2.foreach(_ += v._2.get) + (u._1, u._2, u._3 + v._3) + } + + val (statsSum: BDM[Double], logphatOption: Option[BDV[Double]], nonEmptyDocsN: Long) = stats + .treeAggregate((BDM.zeros[Double](k, vocabSize), logphatPartOptionBase(), 0L))( + elementWiseSum, elementWiseSum + ) + expElogbetaBc.destroy(false) - val batchResult = statsSum *:* expElogbeta.t + if (nonEmptyDocsN == 0) { + logWarning("No non-empty documents were submitted in the 
batch.") + // Therefore, there is no need to update any of the model parameters + return this + } + + val batchResult = statsSum *:* expElogbeta.t // Note that this is an optimization to avoid batch.count - updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) - if (optimizeDocConcentration) updateAlpha(gammat) + val batchSize = (miniBatchFraction * corpusSize).ceil.toInt + updateLambda(batchResult, batchSize) + + logphatOption.foreach(_ /= nonEmptyDocsN.toDouble) + logphatOption.foreach(updateAlpha(_, nonEmptyDocsN)) + this } @@ -503,21 +534,22 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Update alpha based on `gammat`, the inferred topic distributions for documents in the - * current mini-batch. Uses Newton-Rhapson method. + * Update alpha based on `logphat`. + * Uses Newton-Rhapson method. * @see Section 3.3, Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters * (http://jonathan-huang.org/research/dirichlet/dirichlet.pdf) + * @param logphat Expectation of estimated log-posterior distribution of + * topics in a document averaged over the batch. + * @param nonEmptyDocsN number of non-empty documents */ - private def updateAlpha(gammat: BDM[Double]): Unit = { + private def updateAlpha(logphat: BDV[Double], nonEmptyDocsN: Double): Unit = { val weight = rho() - val N = gammat.rows.toDouble val alpha = this.alpha.asBreeze.toDenseVector - val logphat: BDV[Double] = - sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)).t / N - val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat) - val c = N * trigamma(sum(alpha)) - val q = -N * trigamma(alpha) + val gradf = nonEmptyDocsN * (-LDAUtils.dirichletExpectation(alpha) + logphat) + + val c = nonEmptyDocsN * trigamma(sum(alpha)) + val q = -nonEmptyDocsN * trigamma(alpha) val b = sum(gradf / q) / (1D / c + sum(1D / q)) val dalpha = -(gradf - b) / q From 6f1d0dea1cdda558c998179789b386f6e52b9e36 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 19 Oct 2017 13:30:55 +0800 Subject: [PATCH 731/779] [SPARK-22300][BUILD] Update ORC to 1.4.1 ## What changes were proposed in this pull request? Apache ORC 1.4.1 is released yesterday. - https://orc.apache.org/news/2017/10/16/ORC-1.4.1/ Like ORC-233 (Allow `orc.include.columns` to be empty), there are several important fixes. This PR updates Apache ORC dependency to use the latest one, 1.4.1. ## How was this patch tested? Pass the Jenkins. Author: Dongjoon Hyun Closes #19521 from dongjoon-hyun/SPARK-22300. 
--- dev/deps/spark-deps-hadoop-2.6 | 6 +++--- dev/deps/spark-deps-hadoop-2.7 | 6 +++--- pom.xml | 6 +++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 76fcbd15869f1..6e2fc63d67108 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar -aircompressor-0.3.jar +aircompressor-0.8.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.7.jar @@ -149,8 +149,8 @@ netty-3.9.9.Final.jar netty-all-4.0.47.Final.jar objenesis-2.1.jar opencsv-2.3.jar -orc-core-1.4.0-nohive.jar -orc-mapreduce-1.4.0-nohive.jar +orc-core-1.4.1-nohive.jar +orc-mapreduce-1.4.1-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index cb20072bf8b30..c2bbc253d723a 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar -aircompressor-0.3.jar +aircompressor-0.8.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.7.jar @@ -150,8 +150,8 @@ netty-3.9.9.Final.jar netty-all-4.0.47.Final.jar objenesis-2.1.jar opencsv-2.3.jar -orc-core-1.4.0-nohive.jar -orc-mapreduce-1.4.0-nohive.jar +orc-core-1.4.1-nohive.jar +orc-mapreduce-1.4.1-nohive.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar diff --git a/pom.xml b/pom.xml index 9fac8b1e53788..b9c972855204a 100644 --- a/pom.xml +++ b/pom.xml @@ -128,7 +128,7 @@ 1.2.1 10.12.1.1 1.8.2 - 1.4.0 + 1.4.1 nohive 1.6.0 9.3.20.v20170531 @@ -1712,6 +1712,10 @@ org.apache.hive hive-storage-api + + io.airlift + slice + From dc2714da50ecba1bf1fdf555a82a4314f763a76e Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 19 Oct 2017 14:56:48 +0800 Subject: [PATCH 732/779] [SPARK-22290][CORE] Avoid creating Hive delegation tokens when not necessary. Hive delegation tokens are only needed when the Spark driver has no access to the kerberos TGT. That happens only in two situations: - when using a proxy user - when using cluster mode without a keytab This change modifies the Hive provider so that it only generates delegation tokens in those situations, and tweaks the YARN AM so that it makes the proper user visible to the Hive code when running with keytabs, so that the TGT can be used instead of a delegation token. The effect of this change is that now it's possible to initialize multiple, non-concurrent SparkContext instances in the same JVM. Before, the second invocation would fail to fetch a new Hive delegation token, which then could make the second (or third or...) application fail once the token expired. With this change, the TGT will be used to authenticate to the HMS instead. This change also avoids polluting the current logged in user's credentials when launching applications. The credentials are copied only when running applications as a proxy user. This makes it possible to implement SPARK-11035 later, where multiple threads might be launching applications, and each app should have its own set of credentials. 
Tested by verifying HDFS and Hive access in following scenarios: - client and cluster mode - client and cluster mode with proxy user - client and cluster mode with principal / keytab - long-running cluster app with principal / keytab - pyspark app that creates (and stops) multiple SparkContext instances through its lifetime Author: Marcelo Vanzin Closes #19509 from vanzin/SPARK-22290. --- .../apache/spark/deploy/SparkHadoopUtil.scala | 17 +++-- .../HBaseDelegationTokenProvider.scala | 4 +- .../HadoopDelegationTokenManager.scala | 2 +- .../HadoopDelegationTokenProvider.scala | 2 +- .../HadoopFSDelegationTokenProvider.scala | 4 +- .../HiveDelegationTokenProvider.scala | 20 +++++- docs/running-on-yarn.md | 9 +++ .../spark/deploy/yarn/ApplicationMaster.scala | 69 +++++++++++++++---- .../org/apache/spark/deploy/yarn/Client.scala | 5 +- .../org/apache/spark/deploy/yarn/config.scala | 4 ++ .../sql/hive/client/HiveClientImpl.scala | 6 -- 11 files changed, 110 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 53775db251bc6..1fa10ab943f34 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -61,13 +61,17 @@ class SparkHadoopUtil extends Logging { * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit) { + createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { + def run: Unit = func() + }) + } + + def createSparkUser(): UserGroupInformation = { val user = Utils.getCurrentUserName() - logDebug("running as user: " + user) + logDebug("creating UGI for user: " + user) val ugi = UserGroupInformation.createRemoteUser(user) transferCredentials(UserGroupInformation.getCurrentUser(), ugi) - ugi.doAs(new PrivilegedExceptionAction[Unit] { - def run: Unit = func() - }) + ugi } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { @@ -417,6 +421,11 @@ class SparkHadoopUtil extends Logging { creds.readTokenStorageStream(new DataInputStream(tokensBuf)) creds } + + def isProxyUser(ugi: UserGroupInformation): Boolean = { + ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY + } + } object SparkHadoopUtil { diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala index 78b0e6b2cbf39..5dcde4ec3a8a4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HBaseDelegationTokenProvider.scala @@ -56,7 +56,9 @@ private[security] class HBaseDelegationTokenProvider None } - override def delegationTokensRequired(hadoopConf: Configuration): Boolean = { + override def delegationTokensRequired( + sparkConf: SparkConf, + hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index c134b7ebe38fa..483d0deec8070 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ 
-115,7 +115,7 @@ private[spark] class HadoopDelegationTokenManager( hadoopConf: Configuration, creds: Credentials): Long = { delegationTokenProviders.values.flatMap { provider => - if (provider.delegationTokensRequired(hadoopConf)) { + if (provider.delegationTokensRequired(sparkConf, hadoopConf)) { provider.obtainDelegationTokens(hadoopConf, sparkConf, creds) } else { logDebug(s"Service ${provider.serviceName} does not require a token." + diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenProvider.scala index 1ba245e84af4b..ed0905088ab25 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenProvider.scala @@ -37,7 +37,7 @@ private[spark] trait HadoopDelegationTokenProvider { * Returns true if delegation tokens are required for this service. By default, it is based on * whether Hadoop security is enabled. */ - def delegationTokensRequired(hadoopConf: Configuration): Boolean + def delegationTokensRequired(sparkConf: SparkConf, hadoopConf: Configuration): Boolean /** * Obtain delegation tokens for this service and get the time of the next renewal. diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala index 300773c58b183..21ca669ea98f0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala @@ -69,7 +69,9 @@ private[deploy] class HadoopFSDelegationTokenProvider(fileSystems: Configuration nextRenewalDate } - def delegationTokensRequired(hadoopConf: Configuration): Boolean = { + override def delegationTokensRequired( + sparkConf: SparkConf, + hadoopConf: Configuration): Boolean = { UserGroupInformation.isSecurityEnabled } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala index b31cc595ed83b..ece5ce79c650d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HiveDelegationTokenProvider.scala @@ -31,7 +31,9 @@ import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.Token import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.KEYTAB import org.apache.spark.util.Utils private[security] class HiveDelegationTokenProvider @@ -55,9 +57,21 @@ private[security] class HiveDelegationTokenProvider } } - override def delegationTokensRequired(hadoopConf: Configuration): Boolean = { + override def delegationTokensRequired( + sparkConf: SparkConf, + hadoopConf: Configuration): Boolean = { + // Delegation tokens are needed only when: + // - trying to connect to a secure metastore + // - either deploying in cluster mode without a keytab, or impersonating another user + // + // Other modes (such as client with or without keytab, or cluster mode with keytab) do not need + // a delegation token, since there's a valid kerberos TGT for the right user available to the + // driver, which 
is the only process that connects to the HMS. + val deployMode = sparkConf.get("spark.submit.deployMode", "client") UserGroupInformation.isSecurityEnabled && - hiveConf(hadoopConf).getTrimmed("hive.metastore.uris", "").nonEmpty + hiveConf(hadoopConf).getTrimmed("hive.metastore.uris", "").nonEmpty && + (SparkHadoopUtil.get.isProxyUser(UserGroupInformation.getCurrentUser()) || + (deployMode == "cluster" && !sparkConf.contains(KEYTAB))) } override def obtainDelegationTokens( @@ -83,7 +97,7 @@ private[security] class HiveDelegationTokenProvider val hive2Token = new Token[DelegationTokenIdentifier]() hive2Token.decodeFromUrlString(tokenStr) - logInfo(s"Get Token from hive metastore: ${hive2Token.toString}") + logDebug(s"Get Token from hive metastore: ${hive2Token.toString}") creds.addToken(new Text("hive.server2.delegation.token"), hive2Token) } diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 432639588cc2b..9599d40c545b2 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -401,6 +401,15 @@ To use a custom metrics.properties for the application master and executors, upd Principal to be used to login to KDC, while running on secure HDFS. (Works also with the "local" master) + + + + + diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index e227bff88f71d..f6167235f89e4 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -20,6 +20,7 @@ package org.apache.spark.deploy.yarn import java.io.{File, IOException} import java.lang.reflect.InvocationTargetException import java.net.{Socket, URI, URL} +import java.security.PrivilegedExceptionAction import java.util.concurrent.{TimeoutException, TimeUnit} import scala.collection.mutable.HashMap @@ -28,6 +29,7 @@ import scala.concurrent.duration.Duration import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.yarn.api._ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration @@ -49,10 +51,7 @@ import org.apache.spark.util._ /** * Common application master functionality for Spark on Yarn. */ -private[spark] class ApplicationMaster( - args: ApplicationMasterArguments, - client: YarnRMClient) - extends Logging { +private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends Logging { // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be // optimal as more containers are available. Might need to handle this better. @@ -62,6 +61,46 @@ private[spark] class ApplicationMaster( .asInstanceOf[YarnConfiguration] private val isClusterMode = args.userClass != null + private val ugi = { + val original = UserGroupInformation.getCurrentUser() + + // If a principal and keytab were provided, log in to kerberos, and set up a thread to + // renew the kerberos ticket when needed. Because the UGI API does not expose the TTL + // of the TGT, use a configuration to define how often to check that a relogin is necessary. + // checkTGTAndReloginFromKeytab() is a no-op if the relogin is not yet needed. 
+ val principal = sparkConf.get(PRINCIPAL).orNull + val keytab = sparkConf.get(KEYTAB).orNull + if (principal != null && keytab != null) { + UserGroupInformation.loginUserFromKeytab(principal, keytab) + + val renewer = new Thread() { + override def run(): Unit = Utils.tryLogNonFatalError { + while (true) { + TimeUnit.SECONDS.sleep(sparkConf.get(KERBEROS_RELOGIN_PERIOD)) + UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab() + } + } + } + renewer.setName("am-kerberos-renewer") + renewer.setDaemon(true) + renewer.start() + + // Transfer the original user's tokens to the new user, since that's needed to connect to + // YARN. It also copies over any delegation tokens that might have been created by the + // client, which will then be transferred over when starting executors (until new ones + // are created by the periodic task). + val newUser = UserGroupInformation.getCurrentUser() + SparkHadoopUtil.get.transferCredentials(original, newUser) + newUser + } else { + SparkHadoopUtil.get.createSparkUser() + } + } + + private val client = ugi.doAs(new PrivilegedExceptionAction[YarnRMClient]() { + def run: YarnRMClient = new YarnRMClient() + }) + // Default to twice the number of executors (twice the maximum number of executors if dynamic // allocation is enabled), with a minimum of 3. @@ -201,6 +240,13 @@ private[spark] class ApplicationMaster( } final def run(): Int = { + ugi.doAs(new PrivilegedExceptionAction[Unit]() { + def run: Unit = runImpl() + }) + exitCode + } + + private def runImpl(): Unit = { try { val appAttemptId = client.getAttemptId() @@ -254,11 +300,6 @@ private[spark] class ApplicationMaster( } } - // Call this to force generation of secret so it gets populated into the - // Hadoop UGI. This has to happen before the startUserApplication which does a - // doAs in order for the credentials to be passed on to the executor containers. - val securityMgr = new SecurityManager(sparkConf) - // If the credentials file config is present, we must periodically renew tokens. So create // a new AMDelegationTokenRenewer if (sparkConf.contains(CREDENTIALS_FILE_PATH)) { @@ -284,6 +325,9 @@ private[spark] class ApplicationMaster( credentialRenewerThread.join() } + // Call this to force generation of secret so it gets populated into the Hadoop UGI. + val securityMgr = new SecurityManager(sparkConf) + if (isClusterMode) { runDriver(securityMgr) } else { @@ -297,7 +341,6 @@ private[spark] class ApplicationMaster( ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION, "Uncaught exception: " + e) } - exitCode } /** @@ -775,10 +818,8 @@ object ApplicationMaster extends Logging { sys.props(k) = v } } - SparkHadoopUtil.get.runAsSparkUser { () => - master = new ApplicationMaster(amArgs, new YarnRMClient) - System.exit(master.run()) - } + master = new ApplicationMaster(amArgs) + System.exit(master.run()) } private[spark] def sparkContextInitialized(sc: SparkContext): Unit = { diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 64b2b4d4db549..1fe25c4ddaabf 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -394,7 +394,10 @@ private[spark] class Client( if (credentials != null) { // Add credentials to current user's UGI, so that following operations don't need to use the // Kerberos tgt to get delegations again in the client side. 
- UserGroupInformation.getCurrentUser.addCredentials(credentials) + val currentUser = UserGroupInformation.getCurrentUser() + if (SparkHadoopUtil.get.isProxyUser(currentUser)) { + currentUser.addCredentials(credentials) + } logDebug(YarnSparkHadoopUtil.get.dumpTokens(credentials).mkString("\n")) } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala index 187803cc6050b..e1af8ba087d6e 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala @@ -347,6 +347,10 @@ package object config { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(Long.MaxValue) + private[spark] val KERBEROS_RELOGIN_PERIOD = ConfigBuilder("spark.yarn.kerberos.relogin.period") + .timeConf(TimeUnit.SECONDS) + .createWithDefaultString("1m") + // The list of cache-related config entries. This is used by Client and the AM to clean // up the environment so that these settings do not appear on the web UI. private[yarn] val CACHE_CONFIGS = Seq( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index a01c312d5e497..16c95c53b4201 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -111,12 +111,6 @@ private[hive] class HiveClientImpl( if (clientLoader.isolationOn) { // Switch to the initClassLoader. Thread.currentThread().setContextClassLoader(initClassLoader) - // Set up kerberos credentials for UserGroupInformation.loginUser within current class loader - if (sparkConf.contains("spark.yarn.principal") && sparkConf.contains("spark.yarn.keytab")) { - val principal = sparkConf.get("spark.yarn.principal") - val keytab = sparkConf.get("spark.yarn.keytab") - SparkHadoopUtil.get.loginUserFromKeytab(principal, keytab) - } try { newState() } finally { From 5a07aca4d464e96d75ea17bf6768e24b829872ec Mon Sep 17 00:00:00 2001 From: krishna-pandey Date: Thu, 19 Oct 2017 08:33:14 +0100 Subject: [PATCH 733/779] [SPARK-22188][CORE] Adding security headers for preventing XSS, MitM and MIME sniffing ## What changes were proposed in this pull request? The HTTP Strict-Transport-Security response header (often abbreviated as HSTS) is a security feature that lets a web site tell browsers that it should only be communicated with using HTTPS, instead of using HTTP. Note: The Strict-Transport-Security header is ignored by the browser when your site is accessed using HTTP; this is because an attacker may intercept HTTP connections and inject the header or remove it. When your site is accessed over HTTPS with no certificate errors, the browser knows your site is HTTPS capable and will honor the Strict-Transport-Security header. The HTTP X-XSS-Protection response header is a feature of Internet Explorer, Chrome and Safari that stops pages from loading when they detect reflected cross-site scripting (XSS) attacks. The HTTP X-Content-Type-Options response header is used to protect against MIME sniffing vulnerabilities. ## How was this patch tested? Checked on my system locally. screen shot 2017-10-03 at 6 49 20 pm Author: krishna-pandey Author: Krishna Pandey Closes #19419 from krishna-pandey/SPARK-22188. 
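As a rough usage sketch (assuming nothing beyond the three configuration keys introduced by this patch — spark.ui.xXssProtection, spark.ui.xContentTypeOptions.enabled and spark.ui.strictTransportSecurity), the headers could be enabled from application code as follows. Note that the HSTS header is only emitted for requests arriving over HTTPS, so the UI must be SSL/TLS enabled for that header to have any effect.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Sketch only: the header values below are examples, not recommendations.
val conf = new SparkConf()
  .set("spark.ui.xXssProtection", "1; mode=block")          // same value as the default added here
  .set("spark.ui.xContentTypeOptions.enabled", "true")      // emits X-Content-Type-Options: nosniff
  .set("spark.ui.strictTransportSecurity", "max-age=31536000; includeSubDomains")  // only honored over HTTPS

val spark = SparkSession.builder()
  .appName("ui-security-headers-example")
  .master("local[*]")   // local master just to make the sketch self-contained
  .config(conf)
  .getOrCreate()
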
--- .../spark/internal/config/package.scala | 18 +++++++ .../org/apache/spark/ui/JettyUtils.scala | 9 ++++ docs/security.md | 47 +++++++++++++++++++ 3 files changed, 74 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 0c36bdcdd2904..6f0247b73070d 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -452,6 +452,24 @@ package object config { .toSequence .createWithDefault(Nil) + private[spark] val UI_X_XSS_PROTECTION = + ConfigBuilder("spark.ui.xXssProtection") + .doc("Value for HTTP X-XSS-Protection response header") + .stringConf + .createWithDefaultString("1; mode=block") + + private[spark] val UI_X_CONTENT_TYPE_OPTIONS = + ConfigBuilder("spark.ui.xContentTypeOptions.enabled") + .doc("Set to 'true' for setting X-Content-Type-Options HTTP response header to 'nosniff'") + .booleanConf + .createWithDefault(true) + + private[spark] val UI_STRICT_TRANSPORT_SECURITY = + ConfigBuilder("spark.ui.strictTransportSecurity") + .doc("Value for HTTP Strict Transport Security Response Header") + .stringConf + .createOptional + private[spark] val EXTRA_LISTENERS = ConfigBuilder("spark.extraListeners") .doc("Class names of listeners to add to SparkContext during initialization.") .stringConf diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 5ee04dad6ed4d..0adeb4058b6e4 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -39,6 +39,7 @@ import org.json4s.jackson.JsonMethods.{pretty, render} import org.apache.spark.{SecurityManager, SparkConf, SSLOptions} import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ import org.apache.spark.util.Utils /** @@ -89,6 +90,14 @@ private[spark] object JettyUtils extends Logging { val result = servletParams.responder(request) response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate") response.setHeader("X-Frame-Options", xFrameOptionsValue) + response.setHeader("X-XSS-Protection", conf.get(UI_X_XSS_PROTECTION)) + if (conf.get(UI_X_CONTENT_TYPE_OPTIONS)) { + response.setHeader("X-Content-Type-Options", "nosniff") + } + if (request.getScheme == "https") { + conf.get(UI_STRICT_TRANSPORT_SECURITY).foreach( + response.setHeader("Strict-Transport-Security", _)) + } response.getWriter.print(servletParams.extractFn(result)) } else { response.setStatus(HttpServletResponse.SC_FORBIDDEN) diff --git a/docs/security.md b/docs/security.md index 1d004003f9a32..15aadf07cf873 100644 --- a/docs/security.md +++ b/docs/security.md @@ -186,7 +186,54 @@ configure those ports.
    Property Name | Default | Meaning
    spark.eventLog.logBlockUpdates.enabled | false |
      Whether to log events for every block update, if spark.eventLog.enabled is true.
      *Warning*: This will increase the size of the event log considerably.
    spark.eventLog.compress | false |
    spark.yarn.kerberos.relogin.period | 1m |
      How often to check whether the kerberos TGT should be renewed. This should be set to a value
      that is shorter than the TGT renewal period (or the TGT lifetime if TGT renewal is not enabled).
      The default value should be enough for most deployments.
    spark.yarn.config.gatewayPath | (none) |

    ### HTTP Security Headers

    Apache Spark can be configured to include HTTP Headers which aids in preventing Cross
    Site Scripting (XSS), Cross-Frame Scripting (XFS), MIME-Sniffing and also enforces HTTP
    Strict Transport Security.

    Property Name | Default | Meaning
    spark.ui.xXssProtection | 1; mode=block |
      Value for HTTP X-XSS-Protection response header. You can choose appropriate value from below:
      • 0 (Disables XSS filtering)
      • 1 (Enables XSS filtering. If a cross-site scripting attack is detected,
        the browser will sanitize the page.)
      • 1; mode=block (Enables XSS filtering. The browser will prevent rendering
        of the page if an attack is detected.)
    spark.ui.xContentTypeOptions.enabled | true |
      When value is set to "true", X-Content-Type-Options HTTP response header will be set
      to "nosniff". Set "false" to disable.
    spark.ui.strictTransportSecurity | None |
      Value for HTTP Strict Transport Security (HSTS) Response Header. You can choose appropriate
      value from below and set expire-time accordingly, when Spark is SSL/TLS enabled.
      • max-age=<expire-time>
      • max-age=<expire-time>; includeSubDomains
      • max-age=<expire-time>; preload
    + See the [configuration page](configuration.html) for more details on the security configuration parameters, and org.apache.spark.SecurityManager for implementation details about security. + From 7fae7995ba05e0333d1decb7ca74ddb7c1b448d7 Mon Sep 17 00:00:00 2001 From: Andrew Ash Date: Fri, 20 Oct 2017 09:40:00 +0900 Subject: [PATCH 734/779] [SPARK-22268][BUILD] Fix lint-java ## What changes were proposed in this pull request? Fix java style issues ## How was this patch tested? Run `./dev/lint-java` locally since it's not run on Jenkins Author: Andrew Ash Closes #19486 from ash211/aash/fix-lint-java. --- .../unsafe/sort/UnsafeInMemorySorter.java | 9 ++++---- .../sort/UnsafeExternalSorterSuite.java | 21 +++++++++++-------- .../sort/UnsafeInMemorySorterSuite.java | 3 ++- .../v2/reader/SupportsPushDownFilters.java | 1 - 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 869ec908be1fb..3bb87a6ed653d 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -172,10 +172,11 @@ public void free() { public void reset() { if (consumer != null) { consumer.freeArray(array); - // the call to consumer.allocateArray may trigger a spill - // which in turn access this instance and eventually re-enter this method and try to free the array again. - // by setting the array to null and its length to 0 we effectively make the spill code-path a no-op. - // setting the array to null also indicates that it has already been de-allocated which prevents a double de-allocation in free(). + // the call to consumer.allocateArray may trigger a spill which in turn access this instance + // and eventually re-enter this method and try to free the array again. by setting the array + // to null and its length to 0 we effectively make the spill code-path a no-op. setting the + // array to null also indicates that it has already been de-allocated which prevents a double + // de-allocation in free(). array = null; usableCapacity = 0; pos = 0; diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 6c5451d0fd2a5..d0d0334add0bf 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -516,12 +516,13 @@ public void testOOMDuringSpill() throws Exception { for (int i = 0; sorter.hasSpaceForAnotherRecord(); ++i) { insertNumber(sorter, i); } - // we expect the next insert to attempt growing the pointerssArray - // first allocation is expected to fail, then a spill is triggered which attempts another allocation - // which also fails and we expect to see this OOM here. - // the original code messed with a released array within the spill code - // and ended up with a failed assertion. 
- // we also expect the location of the OOM to be org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset + // we expect the next insert to attempt growing the pointerssArray first + // allocation is expected to fail, then a spill is triggered which + // attempts another allocation which also fails and we expect to see this + // OOM here. the original code messed with a released array within the + // spill code and ended up with a failed assertion. we also expect the + // location of the OOM to be + // org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset memoryManager.markconsequentOOM(2); try { insertNumber(sorter, 1024); @@ -530,9 +531,11 @@ public void testOOMDuringSpill() throws Exception { // we expect an OutOfMemoryError here, anything else (i.e the original NPE is a failure) catch (OutOfMemoryError oom){ String oomStackTrace = Utils.exceptionString(oom); - assertThat("expected OutOfMemoryError in org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset", - oomStackTrace, - Matchers.containsString("org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset")); + assertThat("expected OutOfMemoryError in " + + "org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset", + oomStackTrace, + Matchers.containsString( + "org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset")); } } diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index 1a3e11efe9787..594f07dd780f9 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -179,7 +179,8 @@ public int compare( } catch (OutOfMemoryError oom) { // as expected } - // [SPARK-21907] this failed on NPE at org.apache.spark.memory.MemoryConsumer.freeArray(MemoryConsumer.java:108) + // [SPARK-21907] this failed on NPE at + // org.apache.spark.memory.MemoryConsumer.freeArray(MemoryConsumer.java:108) sorter.free(); // simulate a 'back to back' free. sorter.free(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java index d6f297c013375..6b0c9d417eeae 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java @@ -18,7 +18,6 @@ package org.apache.spark.sql.sources.v2.reader; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.sources.Filter; /** From b034f2565f72aa73c9f0be1e49d148bb4cf05153 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 19 Oct 2017 20:24:51 -0700 Subject: [PATCH 735/779] [SPARK-22026][SQL] data source v2 write path ## What changes were proposed in this pull request? A working prototype for data source v2 write path. The writing framework is similar to the reading framework. i.e. `WriteSupport` -> `DataSourceV2Writer` -> `DataWriterFactory` -> `DataWriter`. Similar to the `FileCommitPotocol`, the writing API has job and task level commit/abort to support the transaction. ## How was this patch tested? 
new tests Author: Wenchen Fan Closes #19269 from cloud-fan/data-source-v2-write. --- .../spark/sql/sources/v2/WriteSupport.java | 49 ++++ .../sources/v2/writer/DataSourceV2Writer.java | 88 +++++++ .../sql/sources/v2/writer/DataWriter.java | 92 +++++++ .../sources/v2/writer/DataWriterFactory.java | 50 ++++ .../v2/writer/SupportsWriteInternalRow.java | 44 ++++ .../v2/writer/WriterCommitMessage.java | 33 +++ .../apache/spark/sql/DataFrameWriter.scala | 38 ++- .../datasources/v2/DataSourceV2Strategy.scala | 11 +- .../datasources/v2/WriteToDataSourceV2.scala | 133 ++++++++++ .../sql/sources/v2/DataSourceV2Suite.scala | 69 +++++ .../sources/v2/SimpleWritableDataSource.scala | 249 ++++++++++++++++++ 11 files changed, 842 insertions(+), 14 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java new file mode 100644 index 0000000000000..a8a961598bde3 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/WriteSupport.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import java.util.Optional; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.types.StructType; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * provide data writing ability and save the data to the data source. + */ +@InterfaceStability.Evolving +public interface WriteSupport { + + /** + * Creates an optional {@link DataSourceV2Writer} to save the data to this data source. Data + * sources can return None if there is no writing needed to be done according to the save mode. + * + * @param jobId A unique string for the writing job. 
It's possible that there are many writing + * jobs running at the same time, and the returned {@link DataSourceV2Writer} should + * use this job id to distinguish itself with writers of other jobs. + * @param schema the schema of the data to be written. + * @param mode the save mode which determines what to do when the data are already in this data + * source, please refer to {@link SaveMode} for more details. + * @param options the options for the returned data source writer, which is an immutable + * case-insensitive string-to-string map. + */ + Optional createWriter( + String jobId, StructType schema, SaveMode mode, DataSourceV2Options options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java new file mode 100644 index 0000000000000..8d8e33633fb0d --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.writer; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.WriteSupport; +import org.apache.spark.sql.types.StructType; + +/** + * A data source writer that is returned by + * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}. + * It can mix in various writing optimization interfaces to speed up the data saving. The actual + * writing logic is delegated to {@link DataWriter}. + * + * The writing procedure is: + * 1. Create a writer factory by {@link #createWriterFactory()}, serialize and send it to all the + * partitions of the input data(RDD). + * 2. For each partition, create the data writer, and write the data of the partition with this + * writer. If all the data are written successfully, call {@link DataWriter#commit()}. If + * exception happens during the writing, call {@link DataWriter#abort()}. + * 3. If all writers are successfully committed, call {@link #commit(WriterCommitMessage[])}. If + * some writers are aborted, or the job failed with an unknown reason, call + * {@link #abort(WriterCommitMessage[])}. + * + * Spark won't retry failed writing jobs, users should do it manually in their Spark applications if + * they want to retry. + * + * Please refer to the document of commit/abort methods for detailed specifications. 
+ * + * Note that, this interface provides a protocol between Spark and data sources for transactional + * data writing, but the transaction here is Spark-level transaction, which may not be the + * underlying storage transaction. For example, Spark successfully writes data to a Cassandra data + * source, but Cassandra may need some more time to reach consistency at storage level. + */ +@InterfaceStability.Evolving +public interface DataSourceV2Writer { + + /** + * Creates a writer factory which will be serialized and sent to executors. + */ + DataWriterFactory createWriterFactory(); + + /** + * Commits this writing job with a list of commit messages. The commit messages are collected from + * successful data writers and are produced by {@link DataWriter#commit()}. If this method + * fails(throw exception), this writing job is considered to be failed, and + * {@link #abort(WriterCommitMessage[])} will be called. The written data should only be visible + * to data source readers if this method succeeds. + * + * Note that, one partition may have multiple committed data writers because of speculative tasks. + * Spark will pick the first successful one and get its commit message. Implementations should be + * aware of this and handle it correctly, e.g., have a mechanism to make sure only one data writer + * can commit successfully, or have a way to clean up the data of already-committed writers. + */ + void commit(WriterCommitMessage[] messages); + + /** + * Aborts this writing job because some data writers are failed to write the records and aborted, + * or the Spark job fails with some unknown reasons, or {@link #commit(WriterCommitMessage[])} + * fails. If this method fails(throw exception), the underlying data source may have garbage that + * need to be cleaned manually, but these garbage should not be visible to data source readers. + * + * Unless the abort is triggered by the failure of commit, the given messages should have some + * null slots as there maybe only a few data writers that are committed before the abort + * happens, or some data writers were committed but their commit messages haven't reached the + * driver when the abort is triggered. So this is just a "best effort" for data sources to + * clean up the data left by data writers. + */ + void abort(WriterCommitMessage[] messages); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java new file mode 100644 index 0000000000000..14261419af6f6 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.sources.v2.writer; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A data writer returned by {@link DataWriterFactory#createWriter(int, int)} and is + * responsible for writing data for an input RDD partition. + * + * One Spark task has one exclusive data writer, so there is no thread-safe concern. + * + * {@link #write(Object)} is called for each record in the input RDD partition. If one record fails + * the {@link #write(Object)}, {@link #abort()} is called afterwards and the remaining records will + * not be processed. If all records are successfully written, {@link #commit()} is called. + * + * If this data writer succeeds(all records are successfully written and {@link #commit()} + * succeeds), a {@link WriterCommitMessage} will be sent to the driver side and pass to + * {@link DataSourceV2Writer#commit(WriterCommitMessage[])} with commit messages from other data + * writers. If this data writer fails(one record fails to write or {@link #commit()} fails), an + * exception will be sent to the driver side, and Spark will retry this writing task for some times, + * each time {@link DataWriterFactory#createWriter(int, int)} gets a different `attemptNumber`, + * and finally call {@link DataSourceV2Writer#abort(WriterCommitMessage[])} if all retry fail. + * + * Besides the retry mechanism, Spark may launch speculative tasks if the existing writing task + * takes too long to finish. Different from retried tasks, which are launched one by one after the + * previous one fails, speculative tasks are running simultaneously. It's possible that one input + * RDD partition has multiple data writers with different `attemptNumber` running at the same time, + * and data sources should guarantee that these data writers don't conflict and can work together. + * Implementations can coordinate with driver during {@link #commit()} to make sure only one of + * these data writers can commit successfully. Or implementations can allow all of them to commit + * successfully, and have a way to revert committed data writers without the commit message, because + * Spark only accepts the commit message that arrives first and ignore others. + * + * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data + * source writers, or {@link org.apache.spark.sql.catalyst.InternalRow} for data source writers + * that mix in {@link SupportsWriteInternalRow}. + */ +@InterfaceStability.Evolving +public interface DataWriter { + + /** + * Writes one record. + * + * If this method fails(throw exception), {@link #abort()} will be called and this data writer is + * considered to be failed. + */ + void write(T record); + + /** + * Commits this writer after all records are written successfully, returns a commit message which + * will be send back to driver side and pass to + * {@link DataSourceV2Writer#commit(WriterCommitMessage[])}. + * + * The written data should only be visible to data source readers after + * {@link DataSourceV2Writer#commit(WriterCommitMessage[])} succeeds, which means this method + * should still "hide" the written data and ask the {@link DataSourceV2Writer} at driver side to + * do the final commitment via {@link WriterCommitMessage}. + * + * If this method fails(throw exception), {@link #abort()} will be called and this data writer is + * considered to be failed. + */ + WriterCommitMessage commit(); + + /** + * Aborts this writer if it is failed. 
Implementations should clean up the data for already + * written records. + * + * This method will only be called if there is one record failed to write, or {@link #commit()} + * failed. + * + * If this method fails(throw exception), the underlying data source may have garbage that need + * to be cleaned by {@link DataSourceV2Writer#abort(WriterCommitMessage[])} or manually, but + * these garbage should not be visible to data source readers. + */ + void abort(); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java new file mode 100644 index 0000000000000..f812d102bda1a --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.writer; + +import java.io.Serializable; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A factory of {@link DataWriter} returned by {@link DataSourceV2Writer#createWriterFactory()}, + * which is responsible for creating and initializing the actual data writer at executor side. + * + * Note that, the writer factory will be serialized and sent to executors, then the data writer + * will be created on executors and do the actual writing. So {@link DataWriterFactory} must be + * serializable and {@link DataWriter} doesn't need to be. + */ +@InterfaceStability.Evolving +public interface DataWriterFactory extends Serializable { + + /** + * Returns a data writer to do the actual writing work. + * + * @param partitionId A unique id of the RDD partition that the returned writer will process. + * Usually Spark processes many RDD partitions at the same time, + * implementations should use the partition id to distinguish writers for + * different partitions. + * @param attemptNumber Spark may launch multiple tasks with the same task id. For example, a task + * failed, Spark launches a new task wth the same task id but different + * attempt number. Or a task is too slow, Spark launches new tasks wth the + * same task id but different attempt number, which means there are multiple + * tasks with the same task id running at the same time. Implementations can + * use this attempt number to distinguish writers of different task attempts. 
+ */ + DataWriter createWriter(int partitionId, int attemptNumber); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java new file mode 100644 index 0000000000000..a8e95901f3b07 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.writer; + +import org.apache.spark.annotation.Experimental; +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.InternalRow; + +/** + * A mix-in interface for {@link DataSourceV2Writer}. Data source writers can implement this + * interface to write {@link InternalRow} directly and avoid the row conversion at Spark side. + * This is an experimental and unstable interface, as {@link InternalRow} is not public and may get + * changed in the future Spark versions. + */ + +@InterfaceStability.Evolving +@Experimental +@InterfaceStability.Unstable +public interface SupportsWriteInternalRow extends DataSourceV2Writer { + + @Override + default DataWriterFactory createWriterFactory() { + throw new IllegalStateException( + "createWriterFactory should not be called with SupportsWriteInternalRow."); + } + + DataWriterFactory createInternalRowWriterFactory(); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java new file mode 100644 index 0000000000000..082d6b5dc409f --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriterCommitMessage.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.sources.v2.writer; + +import java.io.Serializable; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A commit message returned by {@link DataWriter#commit()} and will be sent back to the driver side + * as the input parameter of {@link DataSourceV2Writer#commit(WriterCommitMessage[])}. + * + * This is an empty interface, data sources should define their own message class and use it in + * their {@link DataWriter#commit()} and {@link DataSourceV2Writer#commit(WriterCommitMessage[])} + * implementations. + */ +@InterfaceStability.Evolving +public interface WriterCommitMessage extends Serializable {} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index c9e45436ed42f..8d95b24c00619 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql -import java.util.{Locale, Properties} +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Properties, UUID} import scala.collection.JavaConverters._ @@ -29,7 +30,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} +import org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2 import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options, WriteSupport} import org.apache.spark.sql.types.StructType /** @@ -231,12 +234,33 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { assertNotBucketed("save") - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, df.logicalPlan) + val cls = DataSource.lookupDataSource(source) + if (classOf[DataSourceV2].isAssignableFrom(cls)) { + cls.newInstance() match { + case ds: WriteSupport => + val options = new DataSourceV2Options(extraOptions.asJava) + // Using a timestamp and a random UUID to distinguish different writing jobs. This is good + // enough as there won't be tons of writing jobs created at the same second. + val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) + .format(new Date()) + "-" + UUID.randomUUID() + val writer = ds.createWriter(jobId, df.logicalPlan.schema, mode, options) + if (writer.isPresent) { + runCommand(df.sparkSession, "save") { + WriteToDataSourceV2(writer.get(), df.logicalPlan) + } + } + + case _ => throw new AnalysisException(s"$cls does not support data writing.") + } + } else { + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, df.logicalPlan) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index f2cda002245e8..df5b524485f54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -18,20 +18,17 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.Strategy -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} -import org.apache.spark.sql.execution.datasources.DataSourceStrategy -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.execution.SparkPlan object DataSourceV2Strategy extends Strategy { - // TODO: write path override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case DataSourceV2Relation(output, reader) => DataSourceV2ScanExec(output, reader) :: Nil + case WriteToDataSourceV2(writer, query) => + WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil + case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala new file mode 100644 index 0000000000000..92c1e1f4a3383 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.{SparkException, TaskContext} +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils + +/** + * The logical plan for writing data into data source v2. + */ +case class WriteToDataSourceV2(writer: DataSourceV2Writer, query: LogicalPlan) extends LogicalPlan { + override def children: Seq[LogicalPlan] = Seq(query) + override def output: Seq[Attribute] = Nil +} + +/** + * The physical plan for writing data into data source v2. + */ +case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) extends SparkPlan { + override def children: Seq[SparkPlan] = Seq(query) + override def output: Seq[Attribute] = Nil + + override protected def doExecute(): RDD[InternalRow] = { + val writeTask = writer match { + case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory() + case _ => new RowToInternalRowDataWriterFactory(writer.createWriterFactory(), query.schema) + } + + val rdd = query.execute() + val messages = new Array[WriterCommitMessage](rdd.partitions.length) + + logInfo(s"Start processing data source writer: $writer. " + + s"The input RDD has ${messages.length} partitions.") + + try { + sparkContext.runJob( + rdd, + (context: TaskContext, iter: Iterator[InternalRow]) => + DataWritingSparkTask.run(writeTask, context, iter), + rdd.partitions.indices, + (index, message: WriterCommitMessage) => messages(index) = message + ) + + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") + } catch { + case cause: Throwable => + logError(s"Data source writer $writer is aborting.") + try { + writer.abort(messages) + } catch { + case t: Throwable => + logError(s"Data source writer $writer failed to abort.") + cause.addSuppressed(t) + throw new SparkException("Writing job failed.", cause) + } + logError(s"Data source writer $writer aborted.") + throw new SparkException("Writing job aborted.", cause) + } + + sparkContext.emptyRDD + } +} + +object DataWritingSparkTask extends Logging { + def run( + writeTask: DataWriterFactory[InternalRow], + context: TaskContext, + iter: Iterator[InternalRow]): WriterCommitMessage = { + val dataWriter = writeTask.createWriter(context.partitionId(), context.attemptNumber()) + + // write the data and commit this writer. 
+ Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + iter.foreach(dataWriter.write) + logInfo(s"Writer for partition ${context.partitionId()} is committing.") + val msg = dataWriter.commit() + logInfo(s"Writer for partition ${context.partitionId()} committed.") + msg + })(catchBlock = { + // If there is an error, abort this writer + logError(s"Writer for partition ${context.partitionId()} is aborting.") + dataWriter.abort() + logError(s"Writer for partition ${context.partitionId()} aborted.") + }) + } +} + +class RowToInternalRowDataWriterFactory( + rowWriterFactory: DataWriterFactory[Row], + schema: StructType) extends DataWriterFactory[InternalRow] { + + override def createWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + new RowToInternalRowDataWriter( + rowWriterFactory.createWriter(partitionId, attemptNumber), + RowEncoder.apply(schema).resolveAndBind()) + } +} + +class RowToInternalRowDataWriter(rowWriter: DataWriter[Row], encoder: ExpressionEncoder[Row]) + extends DataWriter[InternalRow] { + + override def write(record: InternalRow): Unit = rowWriter.write(encoder.fromRow(record)) + + override def commit(): WriterCommitMessage = rowWriter.commit() + + override def abort(): Unit = rowWriter.abort() +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index f238e565dc2fc..092702a1d5173 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -21,6 +21,7 @@ import java.util.{ArrayList, List => JList} import test.org.apache.spark.sql.sources.v2._ +import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.sources.{Filter, GreaterThan} @@ -80,6 +81,74 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } } + + test("simple writable data source") { + // TODO: java implementation. 
+ Seq(classOf[SimpleWritableDataSource]).foreach { cls => + withTempPath { file => + val path = file.getCanonicalPath + assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) + + spark.range(10).select('id, -'id).write.format(cls.getName) + .option("path", path).save() + checkAnswer( + spark.read.format(cls.getName).option("path", path).load(), + spark.range(10).select('id, -'id)) + + // test with different save modes + spark.range(10).select('id, -'id).write.format(cls.getName) + .option("path", path).mode("append").save() + checkAnswer( + spark.read.format(cls.getName).option("path", path).load(), + spark.range(10).union(spark.range(10)).select('id, -'id)) + + spark.range(5).select('id, -'id).write.format(cls.getName) + .option("path", path).mode("overwrite").save() + checkAnswer( + spark.read.format(cls.getName).option("path", path).load(), + spark.range(5).select('id, -'id)) + + spark.range(5).select('id, -'id).write.format(cls.getName) + .option("path", path).mode("ignore").save() + checkAnswer( + spark.read.format(cls.getName).option("path", path).load(), + spark.range(5).select('id, -'id)) + + val e = intercept[Exception] { + spark.range(5).select('id, -'id).write.format(cls.getName) + .option("path", path).mode("error").save() + } + assert(e.getMessage.contains("data already exists")) + + // test transaction + val failingUdf = org.apache.spark.sql.functions.udf { + var count = 0 + (id: Long) => { + if (count > 5) { + throw new RuntimeException("testing error") + } + count += 1 + id + } + } + // this input data will fail to read middle way. + val input = spark.range(10).select(failingUdf('id).as('i)).select('i, -'i) + val e2 = intercept[SparkException] { + input.write.format(cls.getName).option("path", path).mode("overwrite").save() + } + assert(e2.getMessage.contains("Writing job aborted")) + // make sure we don't have partial data. + assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) + + // test internal row writer + spark.range(5).select('id, -'id).write.format(cls.getName) + .option("path", path).option("internal", "true").mode("overwrite").save() + checkAnswer( + spark.read.format(cls.getName).option("path", path).load(), + spark.range(5).select('id, -'id)) + } + } + } } class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala new file mode 100644 index 0000000000000..6fb60f4d848d7 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.sources.v2 + +import java.io.{BufferedReader, InputStreamReader, IOException} +import java.text.SimpleDateFormat +import java.util.{Collections, Date, List => JList, Locale, Optional, UUID} + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path} + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.sources.v2.reader.{DataReader, DataSourceV2Reader, ReadTask} +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.util.SerializableConfiguration + +/** + * A HDFS based transactional writable data source. + * Each task writes data to `target/_temporary/jobId/$jobId-$partitionId-$attemptNumber`. + * Each job moves files from `target/_temporary/jobId/` to `target`. + */ +class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteSupport { + + private val schema = new StructType().add("i", "long").add("j", "long") + + class Reader(path: String, conf: Configuration) extends DataSourceV2Reader { + override def readSchema(): StructType = schema + + override def createReadTasks(): JList[ReadTask[Row]] = { + val dataPath = new Path(path) + val fs = dataPath.getFileSystem(conf) + if (fs.exists(dataPath)) { + fs.listStatus(dataPath).filterNot { status => + val name = status.getPath.getName + name.startsWith("_") || name.startsWith(".") + }.map { f => + val serializableConf = new SerializableConfiguration(conf) + new SimpleCSVReadTask(f.getPath.toUri.toString, serializableConf): ReadTask[Row] + }.toList.asJava + } else { + Collections.emptyList() + } + } + } + + class Writer(jobId: String, path: String, conf: Configuration) extends DataSourceV2Writer { + override def createWriterFactory(): DataWriterFactory[Row] = { + new SimpleCSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf)) + } + + override def commit(messages: Array[WriterCommitMessage]): Unit = { + val finalPath = new Path(path) + val jobPath = new Path(new Path(finalPath, "_temporary"), jobId) + val fs = jobPath.getFileSystem(conf) + try { + for (file <- fs.listStatus(jobPath).map(_.getPath)) { + val dest = new Path(finalPath, file.getName) + if(!fs.rename(file, dest)) { + throw new IOException(s"failed to rename($file, $dest)") + } + } + } finally { + fs.delete(jobPath, true) + } + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = { + val jobPath = new Path(new Path(path, "_temporary"), jobId) + val fs = jobPath.getFileSystem(conf) + fs.delete(jobPath, true) + } + } + + class InternalRowWriter(jobId: String, path: String, conf: Configuration) + extends Writer(jobId, path, conf) with SupportsWriteInternalRow { + + override def createWriterFactory(): DataWriterFactory[Row] = { + throw new IllegalArgumentException("not expected!") + } + + override def createInternalRowWriterFactory(): DataWriterFactory[InternalRow] = { + new InternalRowCSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf)) + } + } + + override def createReader(options: DataSourceV2Options): DataSourceV2Reader = { + val path = new Path(options.get("path").get()) + val conf = SparkContext.getActive.get.hadoopConfiguration + new Reader(path.toUri.toString, conf) + } + + override def createWriter( + jobId: String, + schema: StructType, + mode: SaveMode, + options: DataSourceV2Options): 
Optional[DataSourceV2Writer] = { + assert(DataType.equalsStructurally(schema.asNullable, this.schema.asNullable)) + assert(!SparkContext.getActive.get.conf.getBoolean("spark.speculation", false)) + + val path = new Path(options.get("path").get()) + val internal = options.get("internal").isPresent + val conf = SparkContext.getActive.get.hadoopConfiguration + val fs = path.getFileSystem(conf) + + if (mode == SaveMode.ErrorIfExists) { + if (fs.exists(path)) { + throw new RuntimeException("data already exists.") + } + } + if (mode == SaveMode.Ignore) { + if (fs.exists(path)) { + return Optional.empty() + } + } + if (mode == SaveMode.Overwrite) { + fs.delete(path, true) + } + + Optional.of(createWriter(jobId, path, conf, internal)) + } + + private def createWriter( + jobId: String, path: Path, conf: Configuration, internal: Boolean): DataSourceV2Writer = { + val pathStr = path.toUri.toString + if (internal) { + new InternalRowWriter(jobId, pathStr, conf) + } else { + new Writer(jobId, pathStr, conf) + } + } +} + +class SimpleCSVReadTask(path: String, conf: SerializableConfiguration) + extends ReadTask[Row] with DataReader[Row] { + + @transient private var lines: Iterator[String] = _ + @transient private var currentLine: String = _ + @transient private var inputStream: FSDataInputStream = _ + + override def createReader(): DataReader[Row] = { + val filePath = new Path(path) + val fs = filePath.getFileSystem(conf.value) + inputStream = fs.open(filePath) + lines = new BufferedReader(new InputStreamReader(inputStream)) + .lines().iterator().asScala + this + } + + override def next(): Boolean = { + if (lines.hasNext) { + currentLine = lines.next() + true + } else { + false + } + } + + override def get(): Row = Row(currentLine.split(",").map(_.trim.toLong): _*) + + override def close(): Unit = { + inputStream.close() + } +} + +class SimpleCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration) + extends DataWriterFactory[Row] { + + override def createWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { + val jobPath = new Path(new Path(path, "_temporary"), jobId) + val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber") + val fs = filePath.getFileSystem(conf.value) + new SimpleCSVDataWriter(fs, filePath) + } +} + +class SimpleCSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[Row] { + + private val out = fs.create(file) + + override def write(record: Row): Unit = { + out.writeBytes(s"${record.getLong(0)},${record.getLong(1)}\n") + } + + override def commit(): WriterCommitMessage = { + out.close() + null + } + + override def abort(): Unit = { + try { + out.close() + } finally { + fs.delete(file, false) + } + } +} + +class InternalRowCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration) + extends DataWriterFactory[InternalRow] { + + override def createWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + val jobPath = new Path(new Path(path, "_temporary"), jobId) + val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber") + val fs = filePath.getFileSystem(conf.value) + new InternalRowCSVDataWriter(fs, filePath) + } +} + +class InternalRowCSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[InternalRow] { + + private val out = fs.create(file) + + override def write(record: InternalRow): Unit = { + out.writeBytes(s"${record.getLong(0)},${record.getLong(1)}\n") + } + + override def commit(): WriterCommitMessage = { + out.close() + null + } + + override def abort(): 
Unit = { + try { + out.close() + } finally { + fs.delete(file, false) + } + } +} From b84f61cd79a365edd4cc893a1de416c628d9906b Mon Sep 17 00:00:00 2001 From: Eric Perry Date: Thu, 19 Oct 2017 23:57:41 -0700 Subject: [PATCH 736/779] [SQL] Mark strategies with override for clarity. ## What changes were proposed in this pull request? This is a very trivial PR, simply marking `strategies` in `SparkPlanner` with the `override` keyword for clarity since it is overriding `strategies` in `QueryPlanner` two levels up in the class hierarchy. I was reading through the code to learn a bit and got stuck on this fact for a little while, so I figured this may be helpful so that another developer new to the project doesn't get stuck where I was. I did not make a JIRA ticket for this because it is so trivial, but I'm happy to do so to adhere to the contribution guidelines if required. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Eric Perry Closes #19537 from ericjperry/override-strategies. --- .../scala/org/apache/spark/sql/execution/SparkPlanner.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala index b143d44eae17b..74048871f8d42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -33,7 +33,7 @@ class SparkPlanner( def numPartitions: Int = conf.numShufflePartitions - def strategies: Seq[Strategy] = + override def strategies: Seq[Strategy] = experimentalMethods.extraStrategies ++ extraPlanningStrategies ++ ( DataSourceV2Strategy :: From 673876b7eadc6f382afc26fc654b0e7916c9ac5c Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 20 Oct 2017 08:28:05 +0100 Subject: [PATCH 737/779] [SPARK-22309][ML] Remove unused param in `LDAModel.getTopicDistributionMethod` ## What changes were proposed in this pull request? Remove unused param in `LDAModel.getTopicDistributionMethod` ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #19530 from zhengruifeng/lda_bc. --- mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 2 +- .../main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 3da29b1c816b1..4bab670cc159f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -458,7 +458,7 @@ abstract class LDAModel private[ml] ( if ($(topicDistributionCol).nonEmpty) { // TODO: Make the transformer natively in ml framework to avoid extra conversion. 
- val transformer = oldLocalModel.getTopicDistributionMethod(sparkSession.sparkContext) + val transformer = oldLocalModel.getTopicDistributionMethod val t = udf { (v: Vector) => transformer(OldVectors.fromML(v)).asML } dataset.withColumn($(topicDistributionCol), t(col($(featuresCol)))).toDF() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 4ab420058f33d..b8a6e94248421 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -371,7 +371,7 @@ class LocalLDAModel private[spark] ( /** * Get a method usable as a UDF for `topicDistributions()` */ - private[spark] def getTopicDistributionMethod(sc: SparkContext): Vector => Vector = { + private[spark] def getTopicDistributionMethod: Vector => Vector = { val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.asBreeze.toDenseMatrix.t).t) val docConcentrationBrz = this.docConcentration.asBreeze val gammaShape = this.gammaShape From e2fea8cd6058a807ff4841b496ea345ff0553044 Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Fri, 20 Oct 2017 09:43:46 +0100 Subject: [PATCH 738/779] [CORE][DOC] Add event log conf. ## What changes were proposed in this pull request? Event Log Server has a total of five configuration parameters, and now the description of the other two configuration parameters on the doc, user-friendly access and use. ## How was this patch tested? manual tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: guoxiaolong Closes #19242 from guoxiaolongzte/addEventLogConf. --- docs/configuration.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 7b9e16a382449..d3c358bb74173 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -748,6 +748,20 @@ Apart from these, the following properties are also available, and may be useful finished. + + spark.eventLog.overwrite + false + + Whether to overwrite any existing files. + + + + spark.eventLog.buffer.kb + 100k + + Buffer size in KB to use when writing to output streams. + + spark.ui.enabled true From 16c9cc68c5a70fd50e214f6deba591f0a9ae5cca Mon Sep 17 00:00:00 2001 From: CenYuhai Date: Fri, 20 Oct 2017 09:27:39 -0700 Subject: [PATCH 739/779] [SPARK-21055][SQL] replace grouping__id with grouping_id() ## What changes were proposed in this pull request? spark does not support grouping__id, it has grouping_id() instead. But it is not convenient for hive user to change to spark-sql so this pr is to replace grouping__id with grouping_id() hive user need not to alter their scripts ## How was this patch tested? test with SQLQuerySuite.scala Author: CenYuhai Closes #18270 from cenyuhai/SPARK-21055. 
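A minimal PySpark sketch of the user-facing effect of this patch (illustrative only, not part of the change; the toy `courseSales` data below borrows the names used in the group-analytics tests and is otherwise hypothetical). After this change `grouping__id` resolves to `grouping_id()` instead of failing analysis:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.createDataFrame(
    [("Java", 2012, 20000), ("Java", 2013, 30000), ("dotNET", 2012, 15000)],
    ["course", "year", "earnings"]
).createOrReplaceTempView("courseSales")

# Before this patch the query below failed with
# "grouping__id is deprecated; use grouping_id() instead";
# now both spellings return the same value for every output row.
spark.sql("""
    SELECT course, year, grouping__id, grouping_id() AS gid
    FROM courseSales
    GROUP BY CUBE(course, year)
    ORDER BY grouping__id, course, year
""").show()
```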
--- .../sql/catalyst/analysis/Analyzer.scala | 15 +-- .../sql-tests/inputs/group-analytics.sql | 6 +- .../sql-tests/results/group-analytics.sql.out | 43 ++++--- .../sql/hive/execution/SQLQuerySuite.scala | 110 ++++++++++++++++++ 4 files changed, 148 insertions(+), 26 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 8edf575db7969..d6a962a14dc9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects} import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, _} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.sql.catalyst.util.toPrettySQL @@ -293,12 +293,6 @@ class Analyzer( Seq(Seq.empty) } - private def hasGroupingAttribute(expr: Expression): Boolean = { - expr.collectFirst { - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => u - }.isDefined - } - private[analysis] def hasGroupingFunction(e: Expression): Boolean = { e.collectFirst { case g: Grouping => g @@ -452,9 +446,6 @@ class Analyzer( // This require transformUp to replace grouping()/grouping_id() in resolved Filter/Sort def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case a if !a.childrenResolved => a // be sure all of the children are resolved. - case p if p.expressions.exists(hasGroupingAttribute) => - failAnalysis( - s"${VirtualColumn.hiveGroupingIdName} is deprecated; use grouping_id() instead") // Ensure group by expressions and aggregate expressions have been resolved. case Aggregate(Seq(c @ Cube(groupByExprs)), aggregateExpressions, child) @@ -1174,6 +1165,10 @@ class Analyzer( case q: LogicalPlan => q transformExpressions { case u if !u.childrenResolved => u // Skip until children are resolved. 
+ case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => + withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() + } case u @ UnresolvedGenerator(name, children) => withPosition(u) { catalog.lookupFunction(name, children) match { diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql index 8aff4cb524199..9721f8c60ebce 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -38,11 +38,11 @@ SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) GROUP BY CUBE(course, year); SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year; SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year; -SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year); +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year; -- GROUPING/GROUPING_ID in having clause SELECT course, year FROM courseSales GROUP BY CUBE(course, year) -HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0; +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year; SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0; SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0; SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0; @@ -54,7 +54,7 @@ SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(co ORDER BY GROUPING(course), GROUPING(year), course, year; SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course); SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course); -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id; +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year; -- Aliases in SELECT could be used in ROLLUP/CUBE/GROUPING SETS SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2); diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out index ce7a16a4d0c81..3439a05727f95 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out @@ -223,22 +223,29 @@ grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 16 -SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year -- !query 16 schema -struct<> +struct -- !query 16 output -org.apache.spark.sql.AnalysisException -grouping__id is deprecated; use grouping_id() instead; +Java 2012 0 +Java 2013 0 +dotNET 2012 0 +dotNET 2013 0 +Java NULL 1 +dotNET NULL 1 +NULL 2012 2 +NULL 2013 2 +NULL NULL 3 -- !query 17 SELECT course, year FROM courseSales GROUP BY CUBE(course, year) -HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year -- !query 17 schema struct -- !query 17 output -Java NULL NULL NULL +Java NULL dotNET NULL @@ -263,10 +270,13 @@ 
grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 20 SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 -- !query 20 schema -struct<> +struct -- !query 20 output -org.apache.spark.sql.AnalysisException -grouping__id is deprecated; use grouping_id() instead; +Java NULL +NULL 2012 +NULL 2013 +NULL NULL +dotNET NULL -- !query 21 @@ -322,12 +332,19 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 25 -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year -- !query 25 schema -struct<> +struct -- !query 25 output -org.apache.spark.sql.AnalysisException -grouping__id is deprecated; use grouping_id() instead; +Java 2012 +Java 2013 +dotNET 2012 +dotNET 2013 +Java NULL +dotNET NULL +NULL 2012 +NULL 2013 +NULL NULL -- !query 26 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 60935c3e85c43..2476a440ad82c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1388,6 +1388,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3))) } + test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #1") { + checkAnswer(sql( + "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH ROLLUP"), + Seq( + (113, 3, 0), + (91, 0, 0), + (500, null, 1), + (84, 1, 0), + (105, 2, 0), + (107, 4, 0) + ).map(i => Row(i._1, i._2, i._3))) + } + test("SPARK-8976 Wrong Result for Rollup #2") { checkAnswer(sql( """ @@ -1409,6 +1422,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } + test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #2") { + checkAnswer(sql( + """ + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 + |FROM src GROUP BY key%5, key-5 + |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, 0, 5, 0), + (1, 0, 15, 0), + (1, 0, 25, 0), + (1, 0, 60, 0), + (1, 0, 75, 0), + (1, 0, 80, 0), + (1, 0, 100, 0), + (1, 0, 140, 0), + (1, 0, 145, 0), + (1, 0, 150, 0) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } + test("SPARK-8976 Wrong Result for Rollup #3") { checkAnswer(sql( """ @@ -1430,6 +1464,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } + test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #3") { + checkAnswer(sql( + """ + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, 0, 5, 0), + (1, 0, 15, 0), + (1, 0, 25, 0), + (1, 0, 60, 0), + (1, 0, 75, 0), + (1, 0, 80, 0), + (1, 0, 100, 0), + (1, 0, 140, 0), + (1, 0, 145, 0), + (1, 0, 150, 0) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } + test("SPARK-8976 Wrong Result for CUBE #1") { checkAnswer(sql( "SELECT count(*) AS cnt, key % 5, grouping_id() FROM src GROUP BY key%5 WITH CUBE"), @@ -1443,6 +1498,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i 
=> Row(i._1, i._2, i._3))) } + test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #1") { + checkAnswer(sql( + "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH CUBE"), + Seq( + (113, 3, 0), + (91, 0, 0), + (500, null, 1), + (84, 1, 0), + (105, 2, 0), + (107, 4, 0) + ).map(i => Row(i._1, i._2, i._3))) + } + test("SPARK-8976 Wrong Result for CUBE #2") { checkAnswer(sql( """ @@ -1464,6 +1532,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } + test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #2") { + checkAnswer(sql( + """ + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, null, -3, 2), + (1, null, -1, 2), + (1, null, 3, 2), + (1, null, 4, 2), + (1, null, 5, 2), + (1, null, 6, 2), + (1, null, 12, 2), + (1, null, 14, 2), + (1, null, 15, 2), + (1, null, 22, 2) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } + test("SPARK-8976 Wrong Result for GroupingSet") { checkAnswer(sql( """ @@ -1485,6 +1574,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3, i._4))) } + test("SPARK-21055 replace grouping__id: Wrong Result for GroupingSet") { + checkAnswer(sql( + """ + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, null, -3, 2), + (1, null, -1, 2), + (1, null, 3, 2), + (1, null, 4, 2), + (1, null, 5, 2), + (1, null, 6, 2), + (1, null, 12, 2), + (1, null, 14, 2), + (1, null, 15, 2), + (1, null, 22, 2) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } + ignore("SPARK-10562: partition by column with mixed case name") { withTable("tbl10562") { val df = Seq(2012 -> "a").toDF("Year", "val") From 568763bafb7acfcf5921d6492034d1f6f87875e2 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 20 Oct 2017 12:32:45 -0700 Subject: [PATCH 740/779] [INFRA] Close stale PRs. Closes #19541 Closes #19542 From b8624b06e5d531ebc14acb05da286f96f4bc9515 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 20 Oct 2017 12:44:30 -0700 Subject: [PATCH 741/779] [SPARK-20396][SQL][PYSPARK][FOLLOW-UP] groupby().apply() with pandas udf ## What changes were proposed in this pull request? This is a follow-up of #18732. This pr modifies `GroupedData.apply()` method to convert pandas udf to grouped udf implicitly. ## How was this patch tested? Exisiting tests. Author: Takuya UESHIN Closes #19517 from ueshin/issues/SPARK-20396/fup2. 
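A short PySpark sketch of the API this follow-up hardens (illustrative, mirroring the updated tests rather than quoting the patch; it assumes pandas and pyarrow are installed, and the column names are arbitrary). `groupby().apply()` now accepts exactly a 1-arg `pandas_udf` with a struct return type and tags it internally as a grouped vectorized UDF instead of reusing the scalar path:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 10.0)], ("id", "v"))

# One pandas.DataFrame in, one pandas.DataFrame out, per group.
subtract_mean = pandas_udf(
    lambda pdf: pdf.assign(v=pdf.v - pdf.v.mean()),
    "id long, v double")

df.groupby("id").apply(subtract_mean).show()

# A 0-arg or multi-arg pandas_udf, or one with a non-struct return type,
# is now rejected with "The argument to apply must be a 1-arg pandas_udf".
```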
--- .../spark/api/python/PythonRunner.scala | 1 + python/pyspark/serializers.py | 1 + python/pyspark/sql/functions.py | 33 ++++++++++------ python/pyspark/sql/group.py | 14 ++++--- python/pyspark/sql/tests.py | 37 ++++++++++++++++++ python/pyspark/worker.py | 39 ++++++++----------- .../logical/pythonLogicalOperators.scala | 9 +++-- .../spark/sql/RelationalGroupedDataset.scala | 7 ++-- .../execution/python/ExtractPythonUDFs.scala | 6 ++- .../python/FlatMapGroupsInPandasExec.scala | 2 +- .../sql/execution/python/PythonUDF.scala | 2 +- .../python/UserDefinedPythonFunction.scala | 13 ++++++- .../python/BatchEvalPythonExecSuite.scala | 2 +- 13 files changed, 114 insertions(+), 52 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 3688a149443c1..d417303bb147d 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -36,6 +36,7 @@ private[spark] object PythonEvalType { val NON_UDF = 0 val SQL_BATCHED_UDF = 1 val SQL_PANDAS_UDF = 2 + val SQL_PANDAS_GROUPED_UDF = 3 } /** diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index ad18bd0c81eaa..a0adeed994456 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -86,6 +86,7 @@ class PythonEvalType(object): NON_UDF = 0 SQL_BATCHED_UDF = 1 SQL_PANDAS_UDF = 2 + SQL_PANDAS_GROUPED_UDF = 3 class Serializer(object): diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 9bc12c3b7a162..9bc374b93a433 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2038,13 +2038,22 @@ def _wrap_function(sc, func, returnType): sc.pythonVer, broadcast_vars, sc._javaAccumulator) +class PythonUdfType(object): + # row-at-a-time UDFs + NORMAL_UDF = 0 + # scalar vectorized UDFs + PANDAS_UDF = 1 + # grouped vectorized UDFs + PANDAS_GROUPED_UDF = 2 + + class UserDefinedFunction(object): """ User defined function in Python .. 
versionadded:: 1.3 """ - def __init__(self, func, returnType, name=None, vectorized=False): + def __init__(self, func, returnType, name=None, pythonUdfType=PythonUdfType.NORMAL_UDF): if not callable(func): raise TypeError( "Not a function or callable (__call__ is not defined): " @@ -2058,7 +2067,7 @@ def __init__(self, func, returnType, name=None, vectorized=False): self._name = name or ( func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) - self.vectorized = vectorized + self.pythonUdfType = pythonUdfType @property def returnType(self): @@ -2090,7 +2099,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.vectorized) + self._name, wrapped_func, jdt, self.pythonUdfType) return judf def __call__(self, *cols): @@ -2121,15 +2130,15 @@ def wrapper(*args): wrapper.func = self.func wrapper.returnType = self.returnType - wrapper.vectorized = self.vectorized + wrapper.pythonUdfType = self.pythonUdfType return wrapper -def _create_udf(f, returnType, vectorized): +def _create_udf(f, returnType, pythonUdfType): - def _udf(f, returnType=StringType(), vectorized=vectorized): - if vectorized: + def _udf(f, returnType=StringType(), pythonUdfType=pythonUdfType): + if pythonUdfType == PythonUdfType.PANDAS_UDF: import inspect argspec = inspect.getargspec(f) if len(argspec.args) == 0 and argspec.varargs is None: @@ -2137,7 +2146,7 @@ def _udf(f, returnType=StringType(), vectorized=vectorized): "0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) - udf_obj = UserDefinedFunction(f, returnType, vectorized=vectorized) + udf_obj = UserDefinedFunction(f, returnType, pythonUdfType=pythonUdfType) return udf_obj._wrapped() # decorator @udf, @udf(), @udf(dataType()), or similar with @pandas_udf @@ -2145,9 +2154,9 @@ def _udf(f, returnType=StringType(), vectorized=vectorized): # If DataType has been passed as a positional argument # for decorator use it as a returnType return_type = f or returnType - return functools.partial(_udf, returnType=return_type, vectorized=vectorized) + return functools.partial(_udf, returnType=return_type, pythonUdfType=pythonUdfType) else: - return _udf(f=f, returnType=returnType, vectorized=vectorized) + return _udf(f=f, returnType=returnType, pythonUdfType=pythonUdfType) @since(1.3) @@ -2181,7 +2190,7 @@ def udf(f=None, returnType=StringType()): | 8| JOHN DOE| 22| +----------+--------------+------------+ """ - return _create_udf(f, returnType=returnType, vectorized=False) + return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.NORMAL_UDF) @since(2.3) @@ -2252,7 +2261,7 @@ def pandas_udf(f=None, returnType=StringType()): .. note:: The user-defined function must be deterministic. 
""" - return _create_udf(f, returnType=returnType, vectorized=True) + return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.PANDAS_UDF) blacklist = ['map', 'since', 'ignore_unicode_prefix'] diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 817d0bc83bb77..e11388d604312 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -19,6 +19,7 @@ from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import Column, _to_seq, _to_java_column, _create_column_from_literal from pyspark.sql.dataframe import DataFrame +from pyspark.sql.functions import PythonUdfType, UserDefinedFunction from pyspark.sql.types import * __all__ = ["GroupedData"] @@ -235,11 +236,13 @@ def apply(self, udf): .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` """ - from pyspark.sql.functions import pandas_udf + import inspect # Columns are special because hasattr always return True - if isinstance(udf, Column) or not hasattr(udf, 'func') or not udf.vectorized: - raise ValueError("The argument to apply must be a pandas_udf") + if isinstance(udf, Column) or not hasattr(udf, 'func') \ + or udf.pythonUdfType != PythonUdfType.PANDAS_UDF \ + or len(inspect.getargspec(udf.func).args) != 1: + raise ValueError("The argument to apply must be a 1-arg pandas_udf") if not isinstance(udf.returnType, StructType): raise ValueError("The returnType of the pandas_udf must be a StructType") @@ -268,8 +271,9 @@ def wrapped(*cols): return [(result[result.columns[i]], arrow_type) for i, arrow_type in enumerate(arrow_return_types)] - wrapped_udf_obj = pandas_udf(wrapped, returnType) - udf_column = wrapped_udf_obj(*[df[col] for col in df.columns]) + udf_obj = UserDefinedFunction( + wrapped, returnType, name=udf.__name__, pythonUdfType=PythonUdfType.PANDAS_GROUPED_UDF) + udf_column = udf_obj(*[df[col] for col in df.columns]) jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) return DataFrame(jdf, self.sql_ctx) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index bac2ef84ae7a7..685eebcafefba 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3383,6 +3383,15 @@ def test_vectorized_udf_varargs(self): res = df.select(f(col('id'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_unsupported_types(self): + from pyspark.sql.functions import pandas_udf, col + schema = StructType([StructField("dt", DateType(), True)]) + df = self.spark.createDataFrame([(datetime.date(1970, 1, 1),)], schema=schema) + f = pandas_udf(lambda x: x, DateType()) + with QuietTest(self.sc): + with self.assertRaisesRegexp(Exception, 'Unsupported data type'): + df.select(f(col('dt'))).collect() + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedPySparkTestCase): @@ -3492,6 +3501,18 @@ def normalize(pdf): expected = expected.assign(norm=expected.norm.astype('float64')) self.assertFramesEqual(expected, result) + def test_datatype_string(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + foo_udf = pandas_udf( + lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), + "id long, v int, v1 double, v2 long") + + result = df.groupby('id').apply(foo_udf).sort('id').toPandas() + expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) + self.assertFramesEqual(expected, result) + def test_wrong_return_type(self): from pyspark.sql.functions import pandas_udf df = self.data @@ -3517,9 +3538,25 @@ def 
test_wrong_args(self): df.groupby('id').apply(sum(df.v)) with self.assertRaisesRegexp(ValueError, 'pandas_udf'): df.groupby('id').apply(df.v + 1) + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply( + pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())]))) + with self.assertRaisesRegexp(ValueError, 'pandas_udf'): + df.groupby('id').apply( + pandas_udf(lambda x, y: x, StructType([StructField("d", DoubleType())]))) with self.assertRaisesRegexp(ValueError, 'returnType'): df.groupby('id').apply(pandas_udf(lambda x: x, DoubleType())) + def test_unsupported_types(self): + from pyspark.sql.functions import pandas_udf, col + schema = StructType( + [StructField("id", LongType(), True), StructField("dt", DateType(), True)]) + df = self.spark.createDataFrame([(1, datetime.date(1970, 1, 1),)], schema=schema) + f = pandas_udf(lambda x: x, df.schema) + with QuietTest(self.sc): + with self.assertRaisesRegexp(Exception, 'Unsupported data type'): + df.groupby('id').apply(f).collect() + if __name__ == "__main__": from pyspark.sql.tests import * diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index eb6d48688dc0a..5e100e0a9a95d 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -32,7 +32,7 @@ from pyspark.serializers import write_with_length, write_int, read_long, \ write_long, read_int, SpecialLengths, PythonEvalType, UTF8Deserializer, PickleSerializer, \ BatchedSerializer, ArrowStreamPandasSerializer -from pyspark.sql.types import to_arrow_type, StructType +from pyspark.sql.types import to_arrow_type from pyspark import shuffle pickleSer = PickleSerializer() @@ -74,28 +74,19 @@ def wrap_udf(f, return_type): def wrap_pandas_udf(f, return_type): - # If the return_type is a StructType, it indicates this is a groupby apply udf, - # and has already been wrapped under apply(), otherwise, it's a vectorized column udf. - # We can distinguish these two by return type because in groupby apply, we always specify - # returnType as a StructType, and in vectorized column udf, StructType is not supported. 
- # - # TODO: Look into refactoring use of StructType to be more flexible for future pandas_udfs - if isinstance(return_type, StructType): - return lambda *a: f(*a) - else: - arrow_return_type = to_arrow_type(return_type) + arrow_return_type = to_arrow_type(return_type) - def verify_result_length(*a): - result = f(*a) - if not hasattr(result, "__len__"): - raise TypeError("Return type of the user-defined functon should be " - "Pandas.Series, but is {}".format(type(result))) - if len(result) != len(a[0]): - raise RuntimeError("Result vector from pandas_udf was not the required length: " - "expected %d, got %d" % (len(a[0]), len(result))) - return result + def verify_result_length(*a): + result = f(*a) + if not hasattr(result, "__len__"): + raise TypeError("Return type of the user-defined functon should be " + "Pandas.Series, but is {}".format(type(result))) + if len(result) != len(a[0]): + raise RuntimeError("Result vector from pandas_udf was not the required length: " + "expected %d, got %d" % (len(a[0]), len(result))) + return result - return lambda *a: (verify_result_length(*a), arrow_return_type) + return lambda *a: (verify_result_length(*a), arrow_return_type) def read_single_udf(pickleSer, infile, eval_type): @@ -111,6 +102,9 @@ def read_single_udf(pickleSer, infile, eval_type): # the last returnType will be the return type of UDF if eval_type == PythonEvalType.SQL_PANDAS_UDF: return arg_offsets, wrap_pandas_udf(row_func, return_type) + elif eval_type == PythonEvalType.SQL_PANDAS_GROUPED_UDF: + # a groupby apply udf has already been wrapped under apply() + return arg_offsets, row_func else: return arg_offsets, wrap_udf(row_func, return_type) @@ -133,7 +127,8 @@ def read_udfs(pickleSer, infile, eval_type): func = lambda _, it: map(mapper, it) - if eval_type == PythonEvalType.SQL_PANDAS_UDF: + if eval_type == PythonEvalType.SQL_PANDAS_UDF \ + or eval_type == PythonEvalType.SQL_PANDAS_GROUPED_UDF: ser = ArrowStreamPandasSerializer() else: ser = BatchedSerializer(PickleSerializer(), 100) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala index 8abab24bc9b44..254687ec00880 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala @@ -24,10 +24,11 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expre * This is used by DataFrame.groupby().apply(). */ case class FlatMapGroupsInPandas( - groupingAttributes: Seq[Attribute], - functionExpr: Expression, - output: Seq[Attribute], - child: LogicalPlan) extends UnaryNode { + groupingAttributes: Seq[Attribute], + functionExpr: Expression, + output: Seq[Attribute], + child: LogicalPlan) extends UnaryNode { + /** * This is needed because output attributes are considered `references` when * passed through the constructor. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 33ec3a27110a8..6b45790d5ff6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression -import org.apache.spark.sql.execution.python.PythonUDF +import org.apache.spark.sql.execution.python.{PythonUDF, PythonUdfType} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{NumericType, StructType} @@ -437,7 +437,7 @@ class RelationalGroupedDataset protected[sql]( } /** - * Applies a vectorized python user-defined function to each group of data. + * Applies a grouped vectorized python user-defined function to each group of data. * The user-defined function defines a transformation: `pandas.DataFrame` -> `pandas.DataFrame`. * For each group, all elements in the group are passed as a `pandas.DataFrame` and the results * for all groups are combined into a new [[DataFrame]]. @@ -449,7 +449,8 @@ class RelationalGroupedDataset protected[sql]( * workers. */ private[sql] def flatMapGroupsInPandas(expr: PythonUDF): DataFrame = { - require(expr.vectorized, "Must pass a vectorized python udf") + require(expr.pythonUdfType == PythonUdfType.PANDAS_GROUPED_UDF, + "Must pass a grouped vectorized python udf") require(expr.dataType.isInstanceOf[StructType], "The returnType of the vectorized python udf must be a StructType") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index e3f952e221d53..d6825369f7378 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -137,11 +137,15 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper { udf.references.subsetOf(child.outputSet) } if (validUdfs.nonEmpty) { + if (validUdfs.exists(_.pythonUdfType == PythonUdfType.PANDAS_GROUPED_UDF)) { + throw new IllegalArgumentException("Can not use grouped vectorized UDFs") + } + val resultAttrs = udfs.zipWithIndex.map { case (u, i) => AttributeReference(s"pythonUDF$i", u.dataType)() } - val evaluation = validUdfs.partition(_.vectorized) match { + val evaluation = validUdfs.partition(_.pythonUdfType == PythonUdfType.PANDAS_UDF) match { case (vectorizedUdfs, plainUdfs) if plainUdfs.isEmpty => ArrowEvalPythonExec(vectorizedUdfs, child.output ++ resultAttrs, child) case (vectorizedUdfs, plainUdfs) if vectorizedUdfs.isEmpty => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala index b996b5bb38ba5..5ed88ada428cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala @@ -94,7 +94,7 @@ case class FlatMapGroupsInPandasExec( val columnarBatchIter = new ArrowPythonRunner( chainedFunc, 
bufferSize, reuseWorker, - PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema) + PythonEvalType.SQL_PANDAS_GROUPED_UDF, argOffsets, schema) .compute(grouped, context.partitionId(), context) columnarBatchIter.flatMap(_.rowIterator.asScala).map(UnsafeProjection.create(output, output)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala index 84a6d9e5be59c..9c07c7638de57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala @@ -29,7 +29,7 @@ case class PythonUDF( func: PythonFunction, dataType: DataType, children: Seq[Expression], - vectorized: Boolean) + pythonUdfType: Int) extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression { override def toString: String = s"$name(${children.mkString(", ")})" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala index a30a80acf5c23..b2fe6c300846a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala @@ -22,6 +22,15 @@ import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.types.DataType +private[spark] object PythonUdfType { + // row-at-a-time UDFs + val NORMAL_UDF = 0 + // scalar vectorized UDFs + val PANDAS_UDF = 1 + // grouped vectorized UDFs + val PANDAS_GROUPED_UDF = 2 +} + /** * A user-defined Python function. This is used by the Python API. */ @@ -29,10 +38,10 @@ case class UserDefinedPythonFunction( name: String, func: PythonFunction, dataType: DataType, - vectorized: Boolean) { + pythonUdfType: Int) { def builder(e: Seq[Expression]): PythonUDF = { - PythonUDF(name, func, dataType, e, vectorized) + PythonUDF(name, func, dataType, e, pythonUdfType) } /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index 153e6e1f88c70..95b21fc9f16ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -109,4 +109,4 @@ class MyDummyPythonUDF extends UserDefinedPythonFunction( name = "dummyUDF", func = new DummyUDF, dataType = BooleanType, - vectorized = false) + pythonUdfType = PythonUdfType.NORMAL_UDF) From d9f286d261c6ee9e8dcb46e78d4666318ea25af2 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Fri, 20 Oct 2017 20:58:55 -0700 Subject: [PATCH 742/779] [SPARK-22326][SQL] Remove unnecessary hashCode and equals methods ## What changes were proposed in this pull request? Plan equality should be computed by `canonicalized`, so we can remove unnecessary `hashCode` and `equals` methods. ## How was this patch tested? Existing tests. Author: Zhenhua Wang Closes #19539 from wzhfy/remove_equals. 
--- .../apache/spark/sql/catalyst/catalog/interface.scala | 11 ----------- .../sql/execution/datasources/LogicalRelation.scala | 11 ----------- 2 files changed, 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 975b084aa6188..1dbae4d37d8f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -22,8 +22,6 @@ import java.util.Date import scala.collection.mutable -import com.google.common.base.Objects - import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation @@ -440,15 +438,6 @@ case class HiveTableRelation( def isPartitioned: Boolean = partitionCols.nonEmpty - override def equals(relation: Any): Boolean = relation match { - case other: HiveTableRelation => tableMeta == other.tableMeta && output == other.output - case _ => false - } - - override def hashCode(): Int = { - Objects.hashCode(tableMeta.identifier, output) - } - override lazy val canonicalized: HiveTableRelation = copy( tableMeta = tableMeta.copy( storage = CatalogStorageFormat.empty, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 17a61074d3b5c..3e98cb28453a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -34,17 +34,6 @@ case class LogicalRelation( override val isStreaming: Boolean) extends LeafNode with MultiInstanceRelation { - // Logical Relations are distinct if they have different output for the sake of transformations. - override def equals(other: Any): Boolean = other match { - case l @ LogicalRelation(otherRelation, _, _, isStreaming) => - relation == otherRelation && output == l.output && isStreaming == l.isStreaming - case _ => false - } - - override def hashCode: Int = { - com.google.common.base.Objects.hashCode(relation, output) - } - // Only care about relation when canonicalizing. override lazy val canonicalized: LogicalPlan = copy( output = output.map(QueryPlan.normalizeExprId(_, output)), From d8cada8d1d3fce979a4bc1f9879593206722a3b9 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 21 Oct 2017 10:05:45 -0700 Subject: [PATCH 743/779] [SPARK-20331][SQL][FOLLOW-UP] Add a SQLConf for enhanced Hive partition pruning predicate pushdown ## What changes were proposed in this pull request? This is a follow-up PR of https://github.com/apache/spark/pull/17633. This PR is to add a conf `spark.sql.hive.advancedPartitionPredicatePushdown.enabled`, which can be used to turn the enhancement off. ## How was this patch tested? Add a test case Author: gatorsmile Closes #19547 from gatorsmile/Spark20331FollowUp. 
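An illustrative sketch of the escape hatch this follow-up adds (not part of the patch; `partitioned_tbl` and `part` are hypothetical names for an existing Hive-backed partitioned table):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Default (true): predicates such as a disjunction over a partition column can
# be converted into a metastore filter string, e.g. "(1 = part or 2 = part)".
spark.sql("SELECT * FROM partitioned_tbl WHERE part = 1 OR part = 2").show()

# Turn the enhancement off to fall back to the basic conversion, which cannot
# push such predicates down to the metastore.
spark.conf.set("spark.sql.hive.advancedPartitionPredicatePushdown.enabled", "false")
```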
--- .../apache/spark/sql/internal/SQLConf.scala | 10 +++++++ .../spark/sql/hive/client/HiveShim.scala | 29 ++++++++++++++++++ .../spark/sql/hive/client/FiltersSuite.scala | 30 +++++++++++++++---- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 618d4a0d6148a..4cfe53b2c115b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -173,6 +173,13 @@ object SQLConf { .intConf .createWithDefault(4) + val ADVANCED_PARTITION_PREDICATE_PUSHDOWN = + buildConf("spark.sql.hive.advancedPartitionPredicatePushdown.enabled") + .internal() + .doc("When true, advanced partition predicate pushdown into Hive metastore is enabled.") + .booleanConf + .createWithDefault(true) + val ENABLE_FALL_BACK_TO_HDFS_FOR_STATS = buildConf("spark.sql.statistics.fallBackToHdfs") .doc("If the table statistics are not available from table metadata enable fall back to hdfs." + @@ -1092,6 +1099,9 @@ class SQLConf extends Serializable with Logging { def limitScaleUpFactor: Int = getConf(LIMIT_SCALE_UP_FACTOR) + def advancedPartitionPredicatePushdownEnabled: Boolean = + getConf(ADVANCED_PARTITION_PREDICATE_PUSHDOWN) + def fallBackToHdfsForStatsEnabled: Boolean = getConf(ENABLE_FALL_BACK_TO_HDFS_FOR_STATS) def preferSortMergeJoin: Boolean = getConf(PREFER_SORTMERGEJOIN) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index cde20da186acd..5c1ff2b76fdaa 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -585,6 +585,35 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { * Unsupported predicates are skipped. */ def convertFilters(table: Table, filters: Seq[Expression]): String = { + if (SQLConf.get.advancedPartitionPredicatePushdownEnabled) { + convertComplexFilters(table, filters) + } else { + convertBasicFilters(table, filters) + } + } + + private def convertBasicFilters(table: Table, filters: Seq[Expression]): String = { + // hive varchar is treated as catalyst string, but hive varchar can't be pushed down. + lazy val varcharKeys = table.getPartitionKeys.asScala + .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME) || + col.getType.startsWith(serdeConstants.CHAR_TYPE_NAME)) + .map(col => col.getName).toSet + + filters.collect { + case op @ BinaryComparison(a: Attribute, Literal(v, _: IntegralType)) => + s"${a.name} ${op.symbol} $v" + case op @ BinaryComparison(Literal(v, _: IntegralType), a: Attribute) => + s"$v ${op.symbol} ${a.name}" + case op @ BinaryComparison(a: Attribute, Literal(v, _: StringType)) + if !varcharKeys.contains(a.name) => + s"""${a.name} ${op.symbol} ${quoteStringLiteral(v.toString)}""" + case op @ BinaryComparison(Literal(v, _: StringType), a: Attribute) + if !varcharKeys.contains(a.name) => + s"""${quoteStringLiteral(v.toString)} ${op.symbol} ${a.name}""" + }.mkString(" and ") + } + + private def convertComplexFilters(table: Table, filters: Seq[Expression]): String = { // hive varchar is treated as catalyst string, but hive varchar can't be pushed down. 
lazy val varcharKeys = table.getPartitionKeys.asScala .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME) || diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 031c1a5ec0ec3..19765695fbcb4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -26,13 +26,15 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** * A set of tests for the filter conversion logic used when pushing partition pruning into the * metastore */ -class FiltersSuite extends SparkFunSuite with Logging { +class FiltersSuite extends SparkFunSuite with Logging with PlanTest { private val shim = new Shim_v0_13 private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test") @@ -72,10 +74,28 @@ class FiltersSuite extends SparkFunSuite with Logging { private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { - val converted = shim.convertFilters(testTable, filters) - if (converted != result) { - fail( - s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'") + withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") { + val converted = shim.convertFilters(testTable, filters) + if (converted != result) { + fail(s"Expected ${filters.mkString(",")} to convert to '$result' but got '$converted'") + } + } + } + } + + test("turn on/off ADVANCED_PARTITION_PREDICATE_PUSHDOWN") { + import org.apache.spark.sql.catalyst.dsl.expressions._ + Seq(true, false).foreach { enabled => + withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> enabled.toString) { + val filters = + (Literal(1) === a("intcol", IntegerType) || + Literal(2) === a("intcol", IntegerType)) :: Nil + val converted = shim.convertFilters(testTable, filters) + if (enabled) { + assert(converted == "(1 = intcol or 2 = intcol)") + } else { + assert(converted.isEmpty) + } } } } From a763607e4fc24f4dc0f455b67a63acba5be1c80a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 21 Oct 2017 10:07:31 -0700 Subject: [PATCH 744/779] [SPARK-21055][SQL][FOLLOW-UP] replace grouping__id with grouping_id() ## What changes were proposed in this pull request? Simplifies the test cases that were added in the PR https://github.com/apache/spark/pull/18270. ## How was this patch tested? N/A Author: gatorsmile Closes #19546 from gatorsmile/backportSPARK-21055. 
--- .../sql/hive/execution/SQLQuerySuite.scala | 306 ++++++------------ 1 file changed, 104 insertions(+), 202 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 2476a440ad82c..1cf1c5cd5a472 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1376,223 +1376,125 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-8976 Wrong Result for Rollup #1") { - checkAnswer(sql( - "SELECT count(*) AS cnt, key % 5, grouping_id() FROM src GROUP BY key%5 WITH ROLLUP"), - Seq( - (113, 3, 0), - (91, 0, 0), - (500, null, 1), - (84, 1, 0), - (105, 2, 0), - (107, 4, 0) - ).map(i => Row(i._1, i._2, i._3))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #1") { - checkAnswer(sql( - "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH ROLLUP"), - Seq( - (113, 3, 0), - (91, 0, 0), - (500, null, 1), - (84, 1, 0), - (105, 2, 0), - (107, 4, 0) - ).map(i => Row(i._1, i._2, i._3))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s"SELECT count(*) AS cnt, key % 5, $gid FROM src GROUP BY key%5 WITH ROLLUP"), + Seq( + (113, 3, 0), + (91, 0, 0), + (500, null, 1), + (84, 1, 0), + (105, 2, 0), + (107, 4, 0) + ).map(i => Row(i._1, i._2, i._3))) + } } test("SPARK-8976 Wrong Result for Rollup #2") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping_id() AS k3 - |FROM src GROUP BY key%5, key-5 - |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, 0, 5, 0), - (1, 0, 15, 0), - (1, 0, 25, 0), - (1, 0, 60, 0), - (1, 0, 75, 0), - (1, 0, 80, 0), - (1, 0, 100, 0), - (1, 0, 140, 0), - (1, 0, 145, 0), - (1, 0, 150, 0) - ).map(i => Row(i._1, i._2, i._3, i._4))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #2") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 - |FROM src GROUP BY key%5, key-5 - |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, 0, 5, 0), - (1, 0, 15, 0), - (1, 0, 25, 0), - (1, 0, 60, 0), - (1, 0, 75, 0), - (1, 0, 80, 0), - (1, 0, 100, 0), - (1, 0, 140, 0), - (1, 0, 145, 0), - (1, 0, 150, 0) - ).map(i => Row(i._1, i._2, i._3, i._4))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, $gid AS k3 + |FROM src GROUP BY key%5, key-5 + |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, 0, 5, 0), + (1, 0, 15, 0), + (1, 0, 25, 0), + (1, 0, 60, 0), + (1, 0, 75, 0), + (1, 0, 80, 0), + (1, 0, 100, 0), + (1, 0, 140, 0), + (1, 0, 145, 0), + (1, 0, 150, 0) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } test("SPARK-8976 Wrong Result for Rollup #3") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping_id() AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, 0, 5, 0), - (1, 0, 15, 0), - (1, 0, 25, 0), - (1, 0, 60, 0), - (1, 0, 75, 0), - (1, 0, 80, 0), - (1, 0, 100, 0), - (1, 0, 140, 0), - (1, 0, 145, 0), - (1, 0, 150, 0) - ).map(i => Row(i._1, i._2, i._3, i._4))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result 
for Rollup #3") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, 0, 5, 0), - (1, 0, 15, 0), - (1, 0, 25, 0), - (1, 0, 60, 0), - (1, 0, 75, 0), - (1, 0, 80, 0), - (1, 0, 100, 0), - (1, 0, 140, 0), - (1, 0, 145, 0), - (1, 0, 150, 0) - ).map(i => Row(i._1, i._2, i._3, i._4))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, $gid AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, 0, 5, 0), + (1, 0, 15, 0), + (1, 0, 25, 0), + (1, 0, 60, 0), + (1, 0, 75, 0), + (1, 0, 80, 0), + (1, 0, 100, 0), + (1, 0, 140, 0), + (1, 0, 145, 0), + (1, 0, 150, 0) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } test("SPARK-8976 Wrong Result for CUBE #1") { - checkAnswer(sql( - "SELECT count(*) AS cnt, key % 5, grouping_id() FROM src GROUP BY key%5 WITH CUBE"), - Seq( - (113, 3, 0), - (91, 0, 0), - (500, null, 1), - (84, 1, 0), - (105, 2, 0), - (107, 4, 0) - ).map(i => Row(i._1, i._2, i._3))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #1") { - checkAnswer(sql( - "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH CUBE"), - Seq( - (113, 3, 0), - (91, 0, 0), - (500, null, 1), - (84, 1, 0), - (105, 2, 0), - (107, 4, 0) - ).map(i => Row(i._1, i._2, i._3))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s"SELECT count(*) AS cnt, key % 5, $gid FROM src GROUP BY key%5 WITH CUBE"), + Seq( + (113, 3, 0), + (91, 0, 0), + (500, null, 1), + (84, 1, 0), + (105, 2, 0), + (107, 4, 0) + ).map(i => Row(i._1, i._2, i._3))) + } } test("SPARK-8976 Wrong Result for CUBE #2") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping_id() AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, null, -3, 2), - (1, null, -1, 2), - (1, null, 3, 2), - (1, null, 4, 2), - (1, null, 5, 2), - (1, null, 6, 2), - (1, null, 12, 2), - (1, null, 14, 2), - (1, null, 15, 2), - (1, null, 22, 2) - ).map(i => Row(i._1, i._2, i._3, i._4))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #2") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, null, -3, 2), - (1, null, -1, 2), - (1, null, 3, 2), - (1, null, 4, 2), - (1, null, 5, 2), - (1, null, 6, 2), - (1, null, 12, 2), - (1, null, 14, 2), - (1, null, 15, 2), - (1, null, 22, 2) - ).map(i => Row(i._1, i._2, i._3, i._4))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, $gid AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, null, -3, 2), + (1, null, -1, 2), + (1, null, 3, 2), + (1, null, 4, 2), + (1, null, 5, 2), + (1, null, 6, 2), + (1, null, 12, 2), + (1, null, 14, 2), + (1, null, 15, 2), + (1, null, 22, 2) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } test("SPARK-8976 Wrong Result for 
GroupingSet") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping_id() AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, null, -3, 2), - (1, null, -1, 2), - (1, null, 3, 2), - (1, null, 4, 2), - (1, null, 5, 2), - (1, null, 6, 2), - (1, null, 12, 2), - (1, null, 14, 2), - (1, null, 15, 2), - (1, null, 22, 2) - ).map(i => Row(i._1, i._2, i._3, i._4))) - } - - test("SPARK-21055 replace grouping__id: Wrong Result for GroupingSet") { - checkAnswer(sql( - """ - |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3 - |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 - |GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10 - """.stripMargin), - Seq( - (1, null, -3, 2), - (1, null, -1, 2), - (1, null, 3, 2), - (1, null, 4, 2), - (1, null, 5, 2), - (1, null, 6, 2), - (1, null, 12, 2), - (1, null, 14, 2), - (1, null, 15, 2), - (1, null, 22, 2) - ).map(i => Row(i._1, i._2, i._3, i._4))) + Seq("grouping_id()", "grouping__id").foreach { gid => + checkAnswer(sql( + s""" + |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, $gid AS k3 + |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5 + |GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10 + """.stripMargin), + Seq( + (1, null, -3, 2), + (1, null, -1, 2), + (1, null, 3, 2), + (1, null, 4, 2), + (1, null, 5, 2), + (1, null, 6, 2), + (1, null, 12, 2), + (1, null, 14, 2), + (1, null, 15, 2), + (1, null, 22, 2) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } ignore("SPARK-10562: partition by column with mixed case name") { From ff8de99a1c7b4a291e661cd0ad12748f4321e43d Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 22 Oct 2017 02:22:35 +0900 Subject: [PATCH 745/779] [SPARK-22302][INFRA] Remove manual backports for subprocess and print explicit message for < Python 2.7 ## What changes were proposed in this pull request? Seems there was a mistake - missing import for `subprocess.call`, while refactoring this script a long ago, which should be used for backports of some missing functions in `subprocess`, specifically in < Python 2.7. Reproduction is: ``` cd dev && python2.6 ``` ``` >>> from sparktestsupport import shellutils >>> shellutils.subprocess_check_call("ls") Traceback (most recent call last): File "", line 1, in File "sparktestsupport/shellutils.py", line 46, in subprocess_check_call retcode = call(*popenargs, **kwargs) NameError: global name 'call' is not defined ``` For Jenkins logs, please see https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/3950/console Since we dropped the Python 2.6.x support, looks better we remove those workarounds and print out explicit error messages in order to reduce the efforts to find out the root causes for such cases, for example, `https://github.com/apache/spark/pull/19513#issuecomment-337406734`. ## How was this patch tested? Manually tested: ``` ./dev/run-tests ``` ``` Python versions prior to 2.7 are not supported. ``` ``` ./dev/run-tests-jenkins ``` ``` Python versions prior to 2.7 are not supported. ``` Author: hyukjinkwon Closes #19524 from HyukjinKwon/SPARK-22302. 
--- dev/run-tests | 6 ++++++ dev/run-tests-jenkins | 7 ++++++- dev/sparktestsupport/shellutils.py | 31 ++---------------------------- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 257d1e8d50bb4..9cf93d000d0ea 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,4 +20,10 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" +PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then + echo "Python versions prior to 2.7 are not supported." + exit -1 +fi + exec python -u ./dev/run-tests.py "$@" diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index f41f1ac79e381..03fd6ff0fba40 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -25,5 +25,10 @@ FWDIR="$( cd "$( dirname "$0" )/.." && pwd )" cd "$FWDIR" -export PATH=/home/anaconda/bin:$PATH +PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then + echo "Python versions prior to 2.7 are not supported." + exit -1 +fi + exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 05af87189b18d..c7644da88f770 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -21,35 +21,8 @@ import subprocess import sys - -if sys.version_info >= (2, 7): - subprocess_check_output = subprocess.check_output - subprocess_check_call = subprocess.check_call -else: - # SPARK-8763 - # backported from subprocess module in Python 2.7 - def subprocess_check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output - - # backported from subprocess module in Python 2.7 - def subprocess_check_call(*popenargs, **kwargs): - retcode = call(*popenargs, **kwargs) - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise CalledProcessError(retcode, cmd) - return 0 +subprocess_check_output = subprocess.check_output +subprocess_check_call = subprocess.check_call def exit_from_command_with_retcode(cmd, retcode): From ca2a780e7c4c4df2488ef933241c6e65264f8d3c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 21 Oct 2017 18:01:45 -0700 Subject: [PATCH 746/779] [SPARK-21929][SQL] Support `ALTER TABLE table_name ADD COLUMNS(..)` for ORC data source ## What changes were proposed in this pull request? When [SPARK-19261](https://issues.apache.org/jira/browse/SPARK-19261) implements `ALTER TABLE ADD COLUMNS`, ORC data source is omitted due to SPARK-14387, SPARK-16628, and SPARK-18355. Now, those issues are fixed and Spark 2.3 is [using Spark schema to read ORC table instead of ORC file schema](https://github.com/apache/spark/commit/e6e36004afc3f9fc8abea98542248e9de11b4435). This PR enables `ALTER TABLE ADD COLUMNS` for ORC data source. ## How was this patch tested? Pass the updated and added test cases. Author: Dongjoon Hyun Closes #19545 from dongjoon-hyun/SPARK-21929. 
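For context, a minimal sketch of the DDL path this patch enables, mirroring the `testAddColumn` helper added to `DDLSuite` below (assumes an active `SparkSession` named `spark`; the table and column names are illustrative only):

```scala
// Before this patch, the ALTER TABLE below failed with AnalysisException for ORC
// datasource tables; now the ORC case is accepted alongside JSON/CSV/Parquet.
spark.sql("CREATE TABLE t1 (c1 INT) USING orc")
spark.sql("INSERT INTO t1 VALUES (1)")
spark.sql("ALTER TABLE t1 ADD COLUMNS (c2 INT)")
spark.sql("SELECT * FROM t1 WHERE c2 IS NULL").show()  // the existing row reads back as (1, null)
spark.sql("INSERT INTO t1 VALUES (3, 2)")
```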
--- .../spark/sql/execution/command/tables.scala | 3 +- .../sql/execution/command/DDLSuite.scala | 90 ++++++++++--------- .../sql/hive/execution/HiveDDLSuite.scala | 8 ++ 3 files changed, 58 insertions(+), 43 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 8d95ca6921cf8..38f91639c0422 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -235,11 +235,10 @@ case class AlterTableAddColumnsCommand( DataSource.lookupDataSource(catalogTable.provider.get).newInstance() match { // For datasource table, this command can only support the following File format. // TextFileFormat only default to one column "value" - // OrcFileFormat can not handle difference between user-specified schema and - // inferred schema yet. TODO, once this issue is resolved , we can add Orc back. // Hive type is already considered as hive serde table, so the logic will not // come in here. case _: JsonFileFormat | _: CSVFileFormat | _: ParquetFileFormat => + case s if s.getClass.getCanonicalName.endsWith("OrcFileFormat") => case s => throw new AnalysisException( s""" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 4ed2cecc5faff..21a2c62929146 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2202,56 +2202,64 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + protected def testAddColumn(provider: String): Unit = { + withTable("t1") { + sql(s"CREATE TABLE t1 (c1 int) USING $provider") + sql("INSERT INTO t1 VALUES (1)") + sql("ALTER TABLE t1 ADD COLUMNS (c2 int)") + checkAnswer( + spark.table("t1"), + Seq(Row(1, null)) + ) + checkAnswer( + sql("SELECT * FROM t1 WHERE c2 is null"), + Seq(Row(1, null)) + ) + + sql("INSERT INTO t1 VALUES (3, 2)") + checkAnswer( + sql("SELECT * FROM t1 WHERE c2 = 2"), + Seq(Row(3, 2)) + ) + } + } + + protected def testAddColumnPartitioned(provider: String): Unit = { + withTable("t1") { + sql(s"CREATE TABLE t1 (c1 int, c2 int) USING $provider PARTITIONED BY (c2)") + sql("INSERT INTO t1 PARTITION(c2 = 2) VALUES (1)") + sql("ALTER TABLE t1 ADD COLUMNS (c3 int)") + checkAnswer( + spark.table("t1"), + Seq(Row(1, null, 2)) + ) + checkAnswer( + sql("SELECT * FROM t1 WHERE c3 is null"), + Seq(Row(1, null, 2)) + ) + sql("INSERT INTO t1 PARTITION(c2 =1) VALUES (2, 3)") + checkAnswer( + sql("SELECT * FROM t1 WHERE c3 = 3"), + Seq(Row(2, 3, 1)) + ) + checkAnswer( + sql("SELECT * FROM t1 WHERE c2 = 1"), + Seq(Row(2, 3, 1)) + ) + } + } + val supportedNativeFileFormatsForAlterTableAddColumns = Seq("parquet", "json", "csv") supportedNativeFileFormatsForAlterTableAddColumns.foreach { provider => test(s"alter datasource table add columns - $provider") { - withTable("t1") { - sql(s"CREATE TABLE t1 (c1 int) USING $provider") - sql("INSERT INTO t1 VALUES (1)") - sql("ALTER TABLE t1 ADD COLUMNS (c2 int)") - checkAnswer( - spark.table("t1"), - Seq(Row(1, null)) - ) - checkAnswer( - sql("SELECT * FROM t1 WHERE c2 is null"), - Seq(Row(1, null)) - ) - - sql("INSERT INTO t1 VALUES (3, 2)") - checkAnswer( - sql("SELECT * FROM t1 WHERE c2 = 2"), - Seq(Row(3, 2)) - ) - } + testAddColumn(provider) } } 
supportedNativeFileFormatsForAlterTableAddColumns.foreach { provider => test(s"alter datasource table add columns - partitioned - $provider") { - withTable("t1") { - sql(s"CREATE TABLE t1 (c1 int, c2 int) USING $provider PARTITIONED BY (c2)") - sql("INSERT INTO t1 PARTITION(c2 = 2) VALUES (1)") - sql("ALTER TABLE t1 ADD COLUMNS (c3 int)") - checkAnswer( - spark.table("t1"), - Seq(Row(1, null, 2)) - ) - checkAnswer( - sql("SELECT * FROM t1 WHERE c3 is null"), - Seq(Row(1, null, 2)) - ) - sql("INSERT INTO t1 PARTITION(c2 =1) VALUES (2, 3)") - checkAnswer( - sql("SELECT * FROM t1 WHERE c3 = 3"), - Seq(Row(2, 3, 1)) - ) - checkAnswer( - sql("SELECT * FROM t1 WHERE c2 = 1"), - Seq(Row(2, 3, 1)) - ) - } + testAddColumnPartitioned(provider) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 02e26bbe876a0..d3465a641a1a4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -166,6 +166,14 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA test("drop table") { testDropTable(isDatasourceTable = false) } + + test("alter datasource table add columns - orc") { + testAddColumn("orc") + } + + test("alter datasource table add columns - partitioned - orc") { + testAddColumnPartitioned("orc") + } } class HiveDDLSuite From 57accf6e3965ff69adc4408623916c5003918235 Mon Sep 17 00:00:00 2001 From: Steven Rand Date: Mon, 23 Oct 2017 09:43:45 +0800 Subject: [PATCH 747/779] [SPARK-22319][CORE] call loginUserFromKeytab before accessing hdfs In `SparkSubmit`, call `loginUserFromKeytab` before attempting to make RPC calls to the NameNode. I manually tested this patch by: 1. Confirming that my Spark application failed to launch with the error reported in https://issues.apache.org/jira/browse/SPARK-22319. 2. Applying this patch and confirming that the app no longer fails to launch, even when I have not manually run `kinit` on the host. Presumably we also want integration tests for secure clusters so that we catch this sort of thing. I'm happy to take a shot at this if it's feasible and someone can point me in the right direction. Author: Steven Rand Closes #19540 from sjrand/SPARK-22319. Change-Id: Ic306bfe7181107fbcf92f61d75856afcb5b6f761 --- .../org/apache/spark/deploy/SparkSubmit.scala | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 135bbe93bf28e..b7e6d0ea021a4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -342,6 +342,22 @@ object SparkSubmit extends CommandLineUtils with Logging { val hadoopConf = conf.getOrElse(SparkHadoopUtil.newConfiguration(sparkConf)) val targetDir = Utils.createTempDir() + // assure a keytab is available from any place in a JVM + if (clusterManager == YARN || clusterManager == LOCAL || clusterManager == MESOS) { + if (args.principal != null) { + if (args.keytab != null) { + require(new File(args.keytab).exists(), s"Keytab file: ${args.keytab} does not exist") + // Add keytab and principal configurations in sysProps to make them available + // for later use; e.g. 
in spark sql, the isolated class loader used to talk + // to HiveMetastore will use these settings. They will be set as Java system + // properties and then loaded by SparkConf + sysProps.put("spark.yarn.keytab", args.keytab) + sysProps.put("spark.yarn.principal", args.principal) + UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab) + } + } + } + // Resolve glob path for different resources. args.jars = Option(args.jars).map(resolveGlobPaths(_, hadoopConf)).orNull args.files = Option(args.files).map(resolveGlobPaths(_, hadoopConf)).orNull @@ -641,22 +657,6 @@ object SparkSubmit extends CommandLineUtils with Logging { } } - // assure a keytab is available from any place in a JVM - if (clusterManager == YARN || clusterManager == LOCAL || clusterManager == MESOS) { - if (args.principal != null) { - if (args.keytab != null) { - require(new File(args.keytab).exists(), s"Keytab file: ${args.keytab} does not exist") - // Add keytab and principal configurations in sysProps to make them available - // for later use; e.g. in spark sql, the isolated class loader used to talk - // to HiveMetastore will use these settings. They will be set as Java system - // properties and then loaded by SparkConf - sysProps.put("spark.yarn.keytab", args.keytab) - sysProps.put("spark.yarn.principal", args.principal) - UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab) - } - } - } - if (clusterManager == MESOS && UserGroupInformation.isSecurityEnabled) { setRMPrincipal(sysProps) } From 5a5b6b78517b526771ee5b579d56aa1daa4b3ef1 Mon Sep 17 00:00:00 2001 From: Kohki Nishio Date: Mon, 23 Oct 2017 09:55:46 -0700 Subject: [PATCH 748/779] [SPARK-22303][SQL] Handle Oracle specific jdbc types in OracleDialect TIMESTAMP (-101), BINARY_DOUBLE (101) and BINARY_FLOAT (100) are handled in OracleDialect ## What changes were proposed in this pull request? When a oracle table contains columns whose type is BINARY_FLOAT or BINARY_DOUBLE, spark sql fails to load a table with SQLException ``` java.sql.SQLException: Unsupported type 101 at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$getCatalystType(JdbcUtils.scala:235) at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$8.apply(JdbcUtils.scala:292) at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$8.apply(JdbcUtils.scala:292) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.getSchema(JdbcUtils.scala:291) at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:64) at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.(JDBCRelation.scala:113) at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:47) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146) ``` ## How was this patch tested? I updated a UT which covers type conversion test for types (-101, 100, 101), on top of that I tested this change against actual table with those columns and it was able to read and write to the table. Author: Kohki Nishio Closes #19548 from taroplus/oracle_sql_types_101. 
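The core of the dialect change is a pattern match on Oracle's vendor-specific JDBC type codes; a condensed sketch of the mapping (constants and target types taken from the diff below, the existing NUMERIC handling elided):

```scala
import org.apache.spark.sql.types.{DataType, DoubleType, FloatType, TimestampType}

// 100, 101 and -101 are Oracle's codes for BINARY_FLOAT, BINARY_DOUBLE and
// TIMESTAMP WITH TIME ZONE; anything else falls through to the generic JDBC mapping.
def oracleCatalystType(sqlType: Int): Option[DataType] = sqlType match {
  case 100  => Some(FloatType)      // OracleTypes.BINARY_FLOAT
  case 101  => Some(DoubleType)     // OracleTypes.BINARY_DOUBLE
  case -101 => Some(TimestampType)  // Timestamp with Time Zone
  case _    => None
}
```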
--- .../sql/jdbc/OracleIntegrationSuite.scala | 43 +++++++++++++++--- .../datasources/jdbc/JdbcUtils.scala | 1 - .../apache/spark/sql/jdbc/OracleDialect.scala | 44 +++++++++++-------- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 6 +++ 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index 7680ae3835132..90343182712ed 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -21,7 +21,7 @@ import java.sql.{Connection, Date, Timestamp} import java.util.Properties import java.math.BigDecimal -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.apache.spark.sql.execution.{WholeStageCodegenExec, RowDataSourceScanExec} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -52,7 +52,7 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo import testImplicits._ override val db = new DatabaseOnDocker { - override val imageName = "wnameless/oracle-xe-11g:14.04.4" + override val imageName = "wnameless/oracle-xe-11g:16.04" override val env = Map( "ORACLE_ROOT_PASSWORD" -> "oracle" ) @@ -104,15 +104,18 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo """.stripMargin.replaceAll("\n", " ")) - conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))").executeUpdate(); + conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))").executeUpdate() conn.prepareStatement( - "INSERT INTO numerics VALUES (4, 1.23, 9999999999)").executeUpdate(); - conn.commit(); + "INSERT INTO numerics VALUES (4, 1.23, 9999999999)").executeUpdate() + conn.commit() + + conn.prepareStatement("CREATE TABLE oracle_types (d BINARY_DOUBLE, f BINARY_FLOAT)").executeUpdate() + conn.commit() } test("SPARK-16625 : Importing Oracle numeric types") { - val df = sqlContext.read.jdbc(jdbcUrl, "numerics", new Properties); + val df = sqlContext.read.jdbc(jdbcUrl, "numerics", new Properties) val rows = df.collect() assert(rows.size == 1) val row = rows(0) @@ -307,4 +310,32 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo assert(values.getInt(1).equals(1)) assert(values.getBoolean(2).equals(false)) } + + test("SPARK-22303: handle BINARY_DOUBLE and BINARY_FLOAT as DoubleType and FloatType") { + val tableName = "oracle_types" + val schema = StructType(Seq( + StructField("d", DoubleType, true), + StructField("f", FloatType, true))) + val props = new Properties() + + // write it back to the table (append mode) + val data = spark.sparkContext.parallelize(Seq(Row(1.1, 2.2f))) + val dfWrite = spark.createDataFrame(data, schema) + dfWrite.write.mode(SaveMode.Append).jdbc(jdbcUrl, tableName, props) + + // read records from oracle_types + val dfRead = sqlContext.read.jdbc(jdbcUrl, tableName, new Properties) + val rows = dfRead.collect() + assert(rows.size == 1) + + // check data types + val types = dfRead.schema.map(field => field.dataType) + assert(types(0).equals(DoubleType)) + assert(types(1).equals(FloatType)) + + // check values + val values = rows(0) + assert(values.getDouble(0) === 1.1) + 
assert(values.getFloat(1) === 2.2f) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 71133666b3249..9debc4ff82748 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -230,7 +230,6 @@ object JdbcUtils extends Logging { case java.sql.Types.TIMESTAMP => TimestampType case java.sql.Types.TIMESTAMP_WITH_TIMEZONE => TimestampType - case -101 => TimestampType // Value for Timestamp with Time Zone in Oracle case java.sql.Types.TINYINT => IntegerType case java.sql.Types.VARBINARY => BinaryType case java.sql.Types.VARCHAR => StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index 3b44c1de93a61..e3f106c41c7ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -23,30 +23,36 @@ import org.apache.spark.sql.types._ private case object OracleDialect extends JdbcDialect { + private[jdbc] val BINARY_FLOAT = 100 + private[jdbc] val BINARY_DOUBLE = 101 + private[jdbc] val TIMESTAMPTZ = -101 override def canHandle(url: String): Boolean = url.startsWith("jdbc:oracle") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { - if (sqlType == Types.NUMERIC) { - val scale = if (null != md) md.build().getLong("scale") else 0L - size match { - // Handle NUMBER fields that have no precision/scale in special way - // because JDBC ResultSetMetaData converts this to 0 precision and -127 scale - // For more details, please see - // https://github.com/apache/spark/pull/8780#issuecomment-145598968 - // and - // https://github.com/apache/spark/pull/8780#issuecomment-144541760 - case 0 => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) - // Handle FLOAT fields in a special way because JDBC ResultSetMetaData converts - // this to NUMERIC with -127 scale - // Not sure if there is a more robust way to identify the field as a float (or other - // numeric types that do not specify a scale. - case _ if scale == -127L => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) - case _ => None - } - } else { - None + sqlType match { + case Types.NUMERIC => + val scale = if (null != md) md.build().getLong("scale") else 0L + size match { + // Handle NUMBER fields that have no precision/scale in special way + // because JDBC ResultSetMetaData converts this to 0 precision and -127 scale + // For more details, please see + // https://github.com/apache/spark/pull/8780#issuecomment-145598968 + // and + // https://github.com/apache/spark/pull/8780#issuecomment-144541760 + case 0 => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) + // Handle FLOAT fields in a special way because JDBC ResultSetMetaData converts + // this to NUMERIC with -127 scale + // Not sure if there is a more robust way to identify the field as a float (or other + // numeric types that do not specify a scale. 
+ case _ if scale == -127L => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) + case _ => None + } + case TIMESTAMPTZ => Some(TimestampType) // Value for Timestamp with Time Zone in Oracle + case BINARY_FLOAT => Some(FloatType) // Value for OracleTypes.BINARY_FLOAT + case BINARY_DOUBLE => Some(DoubleType) // Value for OracleTypes.BINARY_DOUBLE + case _ => None } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 34205e0b2bf08..167b3e0190026 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -815,6 +815,12 @@ class JDBCSuite extends SparkFunSuite Some(DecimalType(DecimalType.MAX_PRECISION, 10))) assert(oracleDialect.getCatalystType(java.sql.Types.NUMERIC, "numeric", 0, null) == Some(DecimalType(DecimalType.MAX_PRECISION, 10))) + assert(oracleDialect.getCatalystType(OracleDialect.BINARY_FLOAT, "BINARY_FLOAT", 0, null) == + Some(FloatType)) + assert(oracleDialect.getCatalystType(OracleDialect.BINARY_DOUBLE, "BINARY_DOUBLE", 0, null) == + Some(DoubleType)) + assert(oracleDialect.getCatalystType(OracleDialect.TIMESTAMPTZ, "TIMESTAMP", 0, null) == + Some(TimestampType)) } test("table exists query by jdbc dialect") { From f6290aea24efeb238db88bdaef4e24d50740ca4c Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Mon, 23 Oct 2017 23:02:36 +0100 Subject: [PATCH 749/779] [SPARK-22285][SQL] Change implementation of ApproxCountDistinctForIntervals to TypedImperativeAggregate ## What changes were proposed in this pull request? The current implementation of `ApproxCountDistinctForIntervals` is `ImperativeAggregate`. The number of `aggBufferAttributes` is the number of total words in the hllppHelper array. Each hllppHelper has 52 words by default relativeSD. Since this aggregate function is used in equi-height histogram generation, and the number of buckets in histogram is usually hundreds, the number of `aggBufferAttributes` can easily reach tens of thousands or even more. This leads to a huge method in codegen and causes error: ``` org.codehaus.janino.JaninoRuntimeException: Code of method "apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection" grows beyond 64 KB. ``` Besides, huge generated methods also result in performance regression. In this PR, we change its implementation to `TypedImperativeAggregate`. After the fix, `ApproxCountDistinctForIntervals` can deal with more than thousands endpoints without throwing codegen error, and improve performance from `20 sec` to `2 sec` in a test case of 500 endpoints. ## How was this patch tested? Test by an added test case and existing tests. Author: Zhenhua Wang Closes #19506 from wzhfy/change_forIntervals_typedAgg. 
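With the switch to `TypedImperativeAggregate[Array[Long]]`, the aggregation buffer becomes a flat array of HLL++ words that must be serialized to bytes between stages. An equivalent round-trip sketch using `java.nio.ByteBuffer` (the patch itself uses `Platform.putLong`/`getLong`, see the `serialize`/`deserialize` methods in the diff below; byte order aside, the layout is the same: 8 bytes per word, no header):

```scala
import java.nio.ByteBuffer

// Pack each 64-bit word into 8 bytes.
def serialize(words: Array[Long]): Array[Byte] = {
  val buf = ByteBuffer.allocate(words.length * 8)
  words.foreach(w => buf.putLong(w))
  buf.array()
}

// Inverse of serialize; the added "round trip serialization" test asserts
// that deserialize(serialize(longArray)).sameElements(longArray).
def deserialize(bytes: Array[Byte]): Array[Long] = {
  require(bytes.length % 8 == 0)
  val buf = ByteBuffer.wrap(bytes)
  Array.fill(bytes.length / 8)(buf.getLong())
}
```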
--- .../ApproxCountDistinctForIntervals.scala | 97 ++++++++++--------- ...ApproxCountDistinctForIntervalsSuite.scala | 34 +++---- ...xCountDistinctForIntervalsQuerySuite.scala | 61 ++++++++++++ 3 files changed, 130 insertions(+), 62 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala index 096d1b35a8620..d4421ca20a9bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala @@ -22,9 +22,10 @@ import java.util import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, ExpectsInputTypes, Expression} +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, GenericInternalRow} import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, HyperLogLogPlusPlusHelper} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.Platform /** * This function counts the approximate number of distinct values (ndv) in @@ -46,16 +47,7 @@ case class ApproxCountDistinctForIntervals( relativeSD: Double = 0.05, mutableAggBufferOffset: Int = 0, inputAggBufferOffset: Int = 0) - extends ImperativeAggregate with ExpectsInputTypes { - - def this(child: Expression, endpointsExpression: Expression) = { - this( - child = child, - endpointsExpression = endpointsExpression, - relativeSD = 0.05, - mutableAggBufferOffset = 0, - inputAggBufferOffset = 0) - } + extends TypedImperativeAggregate[Array[Long]] with ExpectsInputTypes { def this(child: Expression, endpointsExpression: Expression, relativeSD: Expression) = { this( @@ -114,29 +106,11 @@ case class ApproxCountDistinctForIntervals( private lazy val totalNumWords = numWordsPerHllpp * hllppArray.length /** Allocate enough words to store all registers. */ - override lazy val aggBufferAttributes: Seq[AttributeReference] = { - Seq.tabulate(totalNumWords) { i => - AttributeReference(s"MS[$i]", LongType)() - } + override def createAggregationBuffer(): Array[Long] = { + Array.fill(totalNumWords)(0L) } - override def aggBufferSchema: StructType = StructType.fromAttributes(aggBufferAttributes) - - // Note: although this simply copies aggBufferAttributes, this common code can not be placed - // in the superclass because that will lead to initialization ordering issues. - override lazy val inputAggBufferAttributes: Seq[AttributeReference] = - aggBufferAttributes.map(_.newInstance()) - - /** Fill all words with zeros. 
*/ - override def initialize(buffer: InternalRow): Unit = { - var word = 0 - while (word < totalNumWords) { - buffer.setLong(mutableAggBufferOffset + word, 0) - word += 1 - } - } - - override def update(buffer: InternalRow, input: InternalRow): Unit = { + override def update(buffer: Array[Long], input: InternalRow): Array[Long] = { val value = child.eval(input) // Ignore empty rows if (value != null) { @@ -153,13 +127,14 @@ case class ApproxCountDistinctForIntervals( // endpoints are sorted into ascending order already if (endpoints.head > doubleValue || endpoints.last < doubleValue) { // ignore if the value is out of the whole range - return + return buffer } val hllppIndex = findHllppIndex(doubleValue) - val offset = mutableAggBufferOffset + hllppIndex * numWordsPerHllpp - hllppArray(hllppIndex).update(buffer, offset, value, child.dataType) + val offset = hllppIndex * numWordsPerHllpp + hllppArray(hllppIndex).update(LongArrayInternalRow(buffer), offset, value, child.dataType) } + buffer } // Find which interval (HyperLogLogPlusPlusHelper) should receive the given value. @@ -196,17 +171,18 @@ case class ApproxCountDistinctForIntervals( } } - override def merge(buffer1: InternalRow, buffer2: InternalRow): Unit = { + override def merge(buffer1: Array[Long], buffer2: Array[Long]): Array[Long] = { for (i <- hllppArray.indices) { hllppArray(i).merge( - buffer1 = buffer1, - buffer2 = buffer2, - offset1 = mutableAggBufferOffset + i * numWordsPerHllpp, - offset2 = inputAggBufferOffset + i * numWordsPerHllpp) + buffer1 = LongArrayInternalRow(buffer1), + buffer2 = LongArrayInternalRow(buffer2), + offset1 = i * numWordsPerHllpp, + offset2 = i * numWordsPerHllpp) } + buffer1 } - override def eval(buffer: InternalRow): Any = { + override def eval(buffer: Array[Long]): Any = { val ndvArray = hllppResults(buffer) // If the endpoints contains multiple elements with the same value, // we set ndv=1 for intervals between these elements. 
@@ -218,19 +194,23 @@ case class ApproxCountDistinctForIntervals( new GenericArrayData(ndvArray) } - def hllppResults(buffer: InternalRow): Array[Long] = { + def hllppResults(buffer: Array[Long]): Array[Long] = { val ndvArray = new Array[Long](hllppArray.length) for (i <- ndvArray.indices) { - ndvArray(i) = hllppArray(i).query(buffer, mutableAggBufferOffset + i * numWordsPerHllpp) + ndvArray(i) = hllppArray(i).query(LongArrayInternalRow(buffer), i * numWordsPerHllpp) } ndvArray } - override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int) + : ApproxCountDistinctForIntervals = { copy(mutableAggBufferOffset = newMutableAggBufferOffset) + } - override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int) + : ApproxCountDistinctForIntervals = { copy(inputAggBufferOffset = newInputAggBufferOffset) + } override def children: Seq[Expression] = Seq(child, endpointsExpression) @@ -239,4 +219,31 @@ case class ApproxCountDistinctForIntervals( override def dataType: DataType = ArrayType(LongType) override def prettyName: String = "approx_count_distinct_for_intervals" + + override def serialize(obj: Array[Long]): Array[Byte] = { + val byteArray = new Array[Byte](obj.length * 8) + var i = 0 + while (i < obj.length) { + Platform.putLong(byteArray, Platform.BYTE_ARRAY_OFFSET + i * 8, obj(i)) + i += 1 + } + byteArray + } + + override def deserialize(bytes: Array[Byte]): Array[Long] = { + assert(bytes.length % 8 == 0) + val length = bytes.length / 8 + val longArray = new Array[Long](length) + var i = 0 + while (i < length) { + longArray(i) = Platform.getLong(bytes, Platform.BYTE_ARRAY_OFFSET + i * 8) + i += 1 + } + longArray + } + + private case class LongArrayInternalRow(array: Array[Long]) extends GenericInternalRow { + override def getLong(offset: Int): Long = array(offset) + override def setLong(offset: Int, value: Long): Unit = { array(offset) = value } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervalsSuite.scala index d6c38c3608bf8..73f18d4feef3f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervalsSuite.scala @@ -32,7 +32,7 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { val wrongColumnTypes = Seq(BinaryType, BooleanType, StringType, ArrayType(IntegerType), MapType(IntegerType, IntegerType), StructType(Seq(StructField("s", IntegerType)))) wrongColumnTypes.foreach { dataType => - val wrongColumn = new ApproxCountDistinctForIntervals( + val wrongColumn = ApproxCountDistinctForIntervals( AttributeReference("a", dataType)(), endpointsExpression = CreateArray(Seq(1, 10).map(Literal(_)))) assert( @@ -43,7 +43,7 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { }) } - var wrongEndpoints = new ApproxCountDistinctForIntervals( + var wrongEndpoints = ApproxCountDistinctForIntervals( AttributeReference("a", DoubleType)(), endpointsExpression = Literal(0.5d)) assert( @@ -52,19 +52,19 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { 
case _ => false }) - wrongEndpoints = new ApproxCountDistinctForIntervals( + wrongEndpoints = ApproxCountDistinctForIntervals( AttributeReference("a", DoubleType)(), endpointsExpression = CreateArray(Seq(AttributeReference("b", DoubleType)()))) assert(wrongEndpoints.checkInputDataTypes() == TypeCheckFailure("The endpoints provided must be constant literals")) - wrongEndpoints = new ApproxCountDistinctForIntervals( + wrongEndpoints = ApproxCountDistinctForIntervals( AttributeReference("a", DoubleType)(), endpointsExpression = CreateArray(Array(10L).map(Literal(_)))) assert(wrongEndpoints.checkInputDataTypes() == TypeCheckFailure("The number of endpoints must be >= 2 to construct intervals")) - wrongEndpoints = new ApproxCountDistinctForIntervals( + wrongEndpoints = ApproxCountDistinctForIntervals( AttributeReference("a", DoubleType)(), endpointsExpression = CreateArray(Array("foobar").map(Literal(_)))) assert(wrongEndpoints.checkInputDataTypes() == @@ -75,25 +75,18 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { private def createEstimator[T]( endpoints: Array[T], dt: DataType, - rsd: Double = 0.05): (ApproxCountDistinctForIntervals, InternalRow, InternalRow) = { + rsd: Double = 0.05): (ApproxCountDistinctForIntervals, InternalRow, Array[Long]) = { val input = new SpecificInternalRow(Seq(dt)) val aggFunc = ApproxCountDistinctForIntervals( BoundReference(0, dt, nullable = true), CreateArray(endpoints.map(Literal(_))), rsd) - val buffer = createBuffer(aggFunc) - (aggFunc, input, buffer) - } - - private def createBuffer(aggFunc: ApproxCountDistinctForIntervals): InternalRow = { - val buffer = new SpecificInternalRow(aggFunc.aggBufferAttributes.map(_.dataType)) - aggFunc.initialize(buffer) - buffer + (aggFunc, input, aggFunc.createAggregationBuffer()) } test("merging ApproxCountDistinctForIntervals instances") { val (aggFunc, input, buffer1a) = createEstimator(Array[Int](0, 10, 2000, 345678, 1000000), IntegerType) - val buffer1b = createBuffer(aggFunc) - val buffer2 = createBuffer(aggFunc) + val buffer1b = aggFunc.createAggregationBuffer() + val buffer2 = aggFunc.createAggregationBuffer() // Add the lower half to `buffer1a`. var i = 0 @@ -123,7 +116,7 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { } // Check if the buffers are equal. 
- assert(buffer2 == buffer1a, "Buffers should be equal") + assert(buffer2.sameElements(buffer1a), "Buffers should be equal") } test("test findHllppIndex(value) for values in the range") { @@ -152,6 +145,13 @@ class ApproxCountDistinctForIntervalsSuite extends SparkFunSuite { checkHllppIndex(endpoints = Array(1, 3, 5, 7, 7, 9), value = 7, expectedIntervalIndex = 2) } + test("round trip serialization") { + val (aggFunc, _, _) = createEstimator(Array(1, 2), DoubleType) + val longArray = (1L to 100L).toArray + val roundtrip = aggFunc.deserialize(aggFunc.serialize(longArray)) + assert(roundtrip.sameElements(longArray)) + } + test("basic operations: update, merge, eval...") { val endpoints = Array[Double](0, 0.33, 0.6, 0.6, 0.6, 1.0) val data: Seq[Double] = Seq(0, 0.6, 0.3, 1, 0.6, 0.5, 0.6, 0.33) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala new file mode 100644 index 0000000000000..c7d86bc955d67 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproxCountDistinctForIntervalsQuerySuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions.{Alias, CreateArray, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.ApproxCountDistinctForIntervals +import org.apache.spark.sql.catalyst.plans.logical.Aggregate +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.test.SharedSQLContext + +class ApproxCountDistinctForIntervalsQuerySuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + // ApproxCountDistinctForIntervals is used in equi-height histogram generation. An equi-height + // histogram usually contains hundreds of buckets. So we need to test + // ApproxCountDistinctForIntervals with large number of endpoints + // (the number of endpoints == the number of buckets + 1). + test("test ApproxCountDistinctForIntervals with large number of endpoints") { + val table = "approx_count_distinct_for_intervals_tbl" + withTable(table) { + (1 to 100000).toDF("col").createOrReplaceTempView(table) + // percentiles of 0, 0.001, 0.002 ... 0.999, 1 + val endpoints = (0 to 1000).map(_ * 100000 / 1000) + + // Since approx_count_distinct_for_intervals is not a public function, here we do + // the computation by constructing logical plan. 
+ val relation = spark.table(table).logicalPlan + val attr = relation.output.find(_.name == "col").get + val aggFunc = ApproxCountDistinctForIntervals(attr, CreateArray(endpoints.map(Literal(_)))) + val aggExpr = aggFunc.toAggregateExpression() + val namedExpr = Alias(aggExpr, aggExpr.toString)() + val ndvsRow = new QueryExecution(spark, Aggregate(Nil, Seq(namedExpr), relation)) + .executedPlan.executeTake(1).head + val ndvArray = ndvsRow.getArray(0).toLongArray() + assert(endpoints.length == ndvArray.length + 1) + + // Each bucket has 100 distinct values. + val expectedNdv = 100 + for (i <- ndvArray.indices) { + val ndv = ndvArray(i) + val error = math.abs((ndv / expectedNdv.toDouble) - 1.0d) + assert(error <= aggFunc.relativeSD * 3.0d, "Error should be within 3 std. errors.") + } + } + } +} From 884d4f95f7ebfaa9d8c57cf770d10a2c6ab82d62 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 23 Oct 2017 17:21:49 -0700 Subject: [PATCH 750/779] [SPARK-21912][SQL][FOLLOW-UP] ORC/Parquet table should not create invalid column names ## What changes were proposed in this pull request? During [SPARK-21912](https://issues.apache.org/jira/browse/SPARK-21912), we skipped testing 'ADD COLUMNS' on ORC tables due to ORC limitation. Since [SPARK-21929](https://issues.apache.org/jira/browse/SPARK-21929) is resolved now, we can test both `ORC` and `PARQUET` completely. ## How was this patch tested? Pass the updated test case. Author: Dongjoon Hyun Closes #19562 from dongjoon-hyun/SPARK-21912-2. --- .../spark/sql/hive/execution/SQLQuerySuite.scala | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 1cf1c5cd5a472..39e918c3d5209 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2031,8 +2031,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("SPARK-21912 ORC/Parquet table should not create invalid column names") { Seq(" ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => - withTable("t21912") { - Seq("ORC", "PARQUET").foreach { source => + Seq("ORC", "PARQUET").foreach { source => + withTable("t21912") { val m = intercept[AnalysisException] { sql(s"CREATE TABLE t21912(`col$name` INT) USING $source") }.getMessage @@ -2049,15 +2049,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { }.getMessage assert(m3.contains(s"contains invalid character(s)")) } - } - // TODO: After SPARK-21929, we need to check ORC, too. - Seq("PARQUET").foreach { source => sql(s"CREATE TABLE t21912(`col` INT) USING $source") - val m = intercept[AnalysisException] { + val m4 = intercept[AnalysisException] { sql(s"ALTER TABLE t21912 ADD COLUMNS(`col$name` INT)") }.getMessage - assert(m.contains(s"contains invalid character(s)")) + assert(m4.contains(s"contains invalid character(s)")) } } } From d9798c834f3fed060cfd18a8d38c398cb2efcc82 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 24 Oct 2017 12:44:47 +0900 Subject: [PATCH 751/779] [SPARK-22313][PYTHON] Mark/print deprecation warnings as DeprecationWarning for deprecated APIs ## What changes were proposed in this pull request? This PR proposes to mark the existing warnings as `DeprecationWarning` and print out warnings for deprecated functions. 
This could be actually useful for Spark app developers. I use (old) PyCharm and this IDE can detect this specific `DeprecationWarning` in some cases: **Before** **After** For console usage, `DeprecationWarning` is usually disabled (see https://docs.python.org/2/library/warnings.html#warning-categories and https://docs.python.org/3/library/warnings.html#warning-categories): ``` >>> import warnings >>> filter(lambda f: f[2] == DeprecationWarning, warnings.filters) [('ignore', <_sre.SRE_Pattern object at 0x10ba58c00>, , <_sre.SRE_Pattern object at 0x10bb04138>, 0), ('ignore', None, , None, 0)] ``` so, it won't actually mess up the terminal much unless it is intended. If this is intendedly enabled, it'd should as below: ``` >>> import warnings >>> warnings.simplefilter('always', DeprecationWarning) >>> >>> from pyspark.sql import functions >>> functions.approxCountDistinct("a") .../spark/python/pyspark/sql/functions.py:232: DeprecationWarning: Deprecated in 2.1, use approx_count_distinct instead. "Deprecated in 2.1, use approx_count_distinct instead.", DeprecationWarning) ... ``` These instances were found by: ``` cd python/pyspark grep -r "Deprecated" . grep -r "deprecated" . grep -r "deprecate" . ``` ## How was this patch tested? Manually tested. Author: hyukjinkwon Closes #19535 from HyukjinKwon/deprecated-warning. --- python/pyspark/ml/util.py | 8 ++- python/pyspark/mllib/classification.py | 2 +- python/pyspark/mllib/evaluation.py | 6 +-- python/pyspark/mllib/regression.py | 8 +-- python/pyspark/sql/dataframe.py | 3 ++ python/pyspark/sql/functions.py | 18 +++++++ python/pyspark/streaming/flume.py | 14 ++++- python/pyspark/streaming/kafka.py | 72 ++++++++++++++++++++++---- 8 files changed, 110 insertions(+), 21 deletions(-) diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 67772910c0d38..c3c47bd79459a 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -175,7 +175,9 @@ def context(self, sqlContext): .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead. """ - warnings.warn("Deprecated in 2.1 and will be removed in 3.0, use session instead.") + warnings.warn( + "Deprecated in 2.1 and will be removed in 3.0, use session instead.", + DeprecationWarning) self._jwrite.context(sqlContext._ssql_ctx) return self @@ -256,7 +258,9 @@ def context(self, sqlContext): .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead. """ - warnings.warn("Deprecated in 2.1 and will be removed in 3.0, use session instead.") + warnings.warn( + "Deprecated in 2.1 and will be removed in 3.0, use session instead.", + DeprecationWarning) self._jread.context(sqlContext._ssql_ctx) return self diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index e04eeb2b60d71..cce703d432b5a 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -311,7 +311,7 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, """ warnings.warn( "Deprecated in 2.0.0. 
Use ml.classification.LogisticRegression or " - "LogisticRegressionWithLBFGS.") + "LogisticRegressionWithLBFGS.", DeprecationWarning) def train(rdd, i): return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations), diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index fc2a0b3b5038a..2cd1da3fbf9aa 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -234,7 +234,7 @@ def precision(self, label=None): """ if label is None: # note:: Deprecated in 2.0.0. Use accuracy. - warnings.warn("Deprecated in 2.0.0. Use accuracy.") + warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) return self.call("precision") else: return self.call("precision", float(label)) @@ -246,7 +246,7 @@ def recall(self, label=None): """ if label is None: # note:: Deprecated in 2.0.0. Use accuracy. - warnings.warn("Deprecated in 2.0.0. Use accuracy.") + warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) return self.call("recall") else: return self.call("recall", float(label)) @@ -259,7 +259,7 @@ def fMeasure(self, label=None, beta=None): if beta is None: if label is None: # note:: Deprecated in 2.0.0. Use accuracy. - warnings.warn("Deprecated in 2.0.0. Use accuracy.") + warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) return self.call("fMeasure") else: return self.call("fMeasure", label) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 1b66f5b51044b..ea107d400621d 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -278,7 +278,8 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, A condition which decides iteration termination. (default: 0.001) """ - warnings.warn("Deprecated in 2.0.0. Use ml.regression.LinearRegression.") + warnings.warn( + "Deprecated in 2.0.0. Use ml.regression.LinearRegression.", DeprecationWarning) def train(rdd, i): return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), @@ -421,7 +422,8 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, """ warnings.warn( "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. " - "Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression.") + "Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression.", + DeprecationWarning) def train(rdd, i): return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step), @@ -566,7 +568,7 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, warnings.warn( "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. " "Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for " - "LinearRegression.") + "LinearRegression.", DeprecationWarning) def train(rdd, i): return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step), diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 38b01f0011671..c0b574e2b93a1 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -130,6 +130,8 @@ def registerTempTable(self, name): .. note:: Deprecated in 2.0, use createOrReplaceTempView instead. """ + warnings.warn( + "Deprecated in 2.0, use createOrReplaceTempView instead.", DeprecationWarning) self._jdf.createOrReplaceTempView(name) @since(2.0) @@ -1308,6 +1310,7 @@ def unionAll(self, other): .. 
note:: Deprecated in 2.0, use :func:`union` instead. """ + warnings.warn("Deprecated in 2.0, use union instead.", DeprecationWarning) return self.union(other) @since(2.3) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 9bc374b93a433..0d40368c9cd6e 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -21,6 +21,7 @@ import math import sys import functools +import warnings if sys.version < "3": from itertools import imap as map @@ -44,6 +45,14 @@ def _(col): return _ +def _wrap_deprecated_function(func, message): + """ Wrap the deprecated function to print out deprecation warnings""" + def _(col): + warnings.warn(message, DeprecationWarning) + return func(col) + return functools.wraps(func)(_) + + def _create_binary_mathfunction(name, doc=""): """ Create a binary mathfunction by name""" def _(col1, col2): @@ -207,6 +216,12 @@ def _(): """returns the relative rank (i.e. percentile) of rows within a window partition.""", } +# Wraps deprecated functions (keys) with the messages (values). +_functions_deprecated = { + 'toDegrees': 'Deprecated in 2.1, use degrees instead.', + 'toRadians': 'Deprecated in 2.1, use radians instead.', +} + for _name, _doc in _functions.items(): globals()[_name] = since(1.3)(_create_function(_name, _doc)) for _name, _doc in _functions_1_4.items(): @@ -219,6 +234,8 @@ def _(): globals()[_name] = since(1.6)(_create_function(_name, _doc)) for _name, _doc in _functions_2_1.items(): globals()[_name] = since(2.1)(_create_function(_name, _doc)) +for _name, _message in _functions_deprecated.items(): + globals()[_name] = _wrap_deprecated_function(globals()[_name], _message) del _name, _doc @@ -227,6 +244,7 @@ def approxCountDistinct(col, rsd=None): """ .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead. """ + warnings.warn("Deprecated in 2.1, use approx_count_distinct instead.", DeprecationWarning) return approx_count_distinct(col, rsd) diff --git a/python/pyspark/streaming/flume.py b/python/pyspark/streaming/flume.py index 2fed5940b31ea..5a975d050b0d8 100644 --- a/python/pyspark/streaming/flume.py +++ b/python/pyspark/streaming/flume.py @@ -54,8 +54,13 @@ def createStream(ssc, hostname, port, :param bodyDecoder: A function used to decode body (default is utf8_decoder) :return: A DStream object - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. + See SPARK-22142. """ + warnings.warn( + "Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. " + "See SPARK-22142.", + DeprecationWarning) jlevel = ssc._sc._getJavaStorageLevel(storageLevel) helper = FlumeUtils._get_helper(ssc._sc) jstream = helper.createStream(ssc._jssc, hostname, port, jlevel, enableDecompression) @@ -82,8 +87,13 @@ def createPollingStream(ssc, addresses, :param bodyDecoder: A function used to decode body (default is utf8_decoder) :return: A DStream object - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. + See SPARK-22142. """ + warnings.warn( + "Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. " + "See SPARK-22142.", + DeprecationWarning) jlevel = ssc._sc._getJavaStorageLevel(storageLevel) hosts = [] ports = [] diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 4af4135c81958..fdb9308604489 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -15,6 +15,8 @@ # limitations under the License. 
# +import warnings + from py4j.protocol import Py4JJavaError from pyspark.rdd import RDD @@ -56,8 +58,13 @@ def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None, :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) if kafkaParams is None: kafkaParams = dict() kafkaParams.update({ @@ -105,8 +112,13 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None, :return: A DStream object .. note:: Experimental - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) if fromOffsets is None: fromOffsets = dict() if not isinstance(topics, list): @@ -159,8 +171,13 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders=None, :return: An RDD object .. note:: Experimental - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) if leaders is None: leaders = dict() if not isinstance(kafkaParams, dict): @@ -229,7 +246,8 @@ class OffsetRange(object): """ Represents a range of offsets from a single Kafka TopicAndPartition. - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, topic, partition, fromOffset, untilOffset): @@ -240,6 +258,10 @@ def __init__(self, topic, partition, fromOffset, untilOffset): :param fromOffset: Inclusive starting offset. :param untilOffset: Exclusive ending offset. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) self.topic = topic self.partition = partition self.fromOffset = fromOffset @@ -270,7 +292,8 @@ class TopicAndPartition(object): """ Represents a specific topic and partition for Kafka. - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, topic, partition): @@ -279,6 +302,10 @@ def __init__(self, topic, partition): :param topic: Kafka topic name. :param partition: Kafka partition id. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) self._topic = topic self._partition = partition @@ -303,7 +330,8 @@ class Broker(object): """ Represent the host and port info for a Kafka broker. - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, host, port): @@ -312,6 +340,10 @@ def __init__(self, host, port): :param host: Broker's hostname. :param port: Broker's port. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. 
" + "See SPARK-21893.", + DeprecationWarning) self._host = host self._port = port @@ -323,10 +355,15 @@ class KafkaRDD(RDD): """ A Python wrapper of KafkaRDD, to provide additional information on normal RDD. - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, jrdd, ctx, jrdd_deserializer): + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) RDD.__init__(self, jrdd, ctx, jrdd_deserializer) def offsetRanges(self): @@ -345,10 +382,15 @@ class KafkaDStream(DStream): """ A Python wrapper of KafkaDStream - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, jdstream, ssc, jrdd_deserializer): + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) DStream.__init__(self, jdstream, ssc, jrdd_deserializer) def foreachRDD(self, func): @@ -383,10 +425,15 @@ class KafkaTransformedDStream(TransformedDStream): """ Kafka specific wrapper of TransformedDStream to transform on Kafka RDD. - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, prev, func): + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) TransformedDStream.__init__(self, prev, func) @property @@ -405,7 +452,8 @@ class KafkaMessageAndMetadata(object): """ Kafka message and metadata information. Including topic, partition, offset and message - .. note:: Deprecated in 2.3.0 + .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. + See SPARK-21893. """ def __init__(self, topic, partition, offset, key, message): @@ -419,6 +467,10 @@ def __init__(self, topic, partition, offset, key, message): :param message: actual message payload of this Kafka message, the return data is undecoded bytearray. """ + warnings.warn( + "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " + "See SPARK-21893.", + DeprecationWarning) self.topic = topic self.partition = partition self.offset = offset From c30d5cfc7117bdadd63bf730e88398139e0f65f4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 24 Oct 2017 08:46:22 +0100 Subject: [PATCH 752/779] [SPARK-20822][SQL] Generate code to directly get value from ColumnVector for table cache ## What changes were proposed in this pull request? This PR generates the Java code to directly get a value for a column in `ColumnVector` without using an iterator (e.g. at lines 54-69 in the generated code example) for table cache (e.g. `dataframe.cache`). This PR improves runtime performance by eliminating data copy from column-oriented storage to `InternalRow` in a `SpecificColumnarIterator` iterator for primitive type. Another PR will support primitive type array. 
Benchmark result: **1.2x** ``` OpenJDK 64-Bit Server VM 1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13 on Linux 4.4.0-22-generic Intel(R) Xeon(R) CPU E5-2667 v3 3.20GHz Int Sum with IntDelta cache: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ InternalRow codegen 731 / 812 43.0 23.2 1.0X ColumnVector codegen 616 / 772 51.0 19.6 1.2X ``` Benchmark program ``` intSumBenchmark(sqlContext, 1024 * 1024 * 30) def intSumBenchmark(sqlContext: SQLContext, values: Int): Unit = { import sqlContext.implicits._ val benchmarkPT = new Benchmark("Int Sum with IntDelta cache", values, 20) Seq(("InternalRow", "false"), ("ColumnVector", "true")).foreach { case (str, value) => withSQLConf(sqlContext, SQLConf. COLUMN_VECTOR_CODEGEN.key -> value) { // tentatively added for benchmarking val dfPassThrough = sqlContext.sparkContext.parallelize(0 to values - 1, 1).toDF().cache() dfPassThrough.count() // force to create df.cache() benchmarkPT.addCase(s"$str codegen") { iter => dfPassThrough.agg(sum("value")).collect } dfPassThrough.unpersist(true) } } benchmarkPT.run() } ``` Motivating example ``` val dsInt = spark.range(3).cache dsInt.count // force to build cache dsInt.filter(_ > 0).collect ``` Generated code ``` /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIterator(references); /* 003 */ } /* 004 */ /* 005 */ final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator { /* 006 */ private Object[] references; /* 007 */ private scala.collection.Iterator[] inputs; /* 008 */ private scala.collection.Iterator inmemorytablescan_input; /* 009 */ private org.apache.spark.sql.execution.metric.SQLMetric inmemorytablescan_numOutputRows; /* 010 */ private org.apache.spark.sql.execution.metric.SQLMetric inmemorytablescan_scanTime; /* 011 */ private long inmemorytablescan_scanTime1; /* 012 */ private org.apache.spark.sql.execution.vectorized.ColumnarBatch inmemorytablescan_batch; /* 013 */ private int inmemorytablescan_batchIdx; /* 014 */ private org.apache.spark.sql.execution.vectorized.OnHeapColumnVector inmemorytablescan_colInstance0; /* 015 */ private UnsafeRow inmemorytablescan_result; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder inmemorytablescan_holder; /* 017 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter inmemorytablescan_rowWriter; /* 018 */ private org.apache.spark.sql.execution.metric.SQLMetric filter_numOutputRows; /* 019 */ private UnsafeRow filter_result; /* 020 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder filter_holder; /* 021 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter filter_rowWriter; /* 022 */ /* 023 */ public GeneratedIterator(Object[] references) { /* 024 */ this.references = references; /* 025 */ } /* 026 */ /* 027 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 028 */ partitionIndex = index; /* 029 */ this.inputs = inputs; /* 030 */ inmemorytablescan_input = inputs[0]; /* 031 */ inmemorytablescan_numOutputRows = (org.apache.spark.sql.execution.metric.SQLMetric) references[0]; /* 032 */ inmemorytablescan_scanTime = (org.apache.spark.sql.execution.metric.SQLMetric) references[1]; /* 033 */ inmemorytablescan_scanTime1 = 0; /* 034 */ inmemorytablescan_batch = null; /* 035 */ inmemorytablescan_batchIdx = 0; /* 036 */ inmemorytablescan_colInstance0 = null; /* 037 */ inmemorytablescan_result = new 
UnsafeRow(1); /* 038 */ inmemorytablescan_holder = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(inmemorytablescan_result, 0); /* 039 */ inmemorytablescan_rowWriter = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(inmemorytablescan_holder, 1); /* 040 */ filter_numOutputRows = (org.apache.spark.sql.execution.metric.SQLMetric) references[2]; /* 041 */ filter_result = new UnsafeRow(1); /* 042 */ filter_holder = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(filter_result, 0); /* 043 */ filter_rowWriter = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(filter_holder, 1); /* 044 */ /* 045 */ } /* 046 */ /* 047 */ protected void processNext() throws java.io.IOException { /* 048 */ if (inmemorytablescan_batch == null) { /* 049 */ inmemorytablescan_nextBatch(); /* 050 */ } /* 051 */ while (inmemorytablescan_batch != null) { /* 052 */ int inmemorytablescan_numRows = inmemorytablescan_batch.numRows(); /* 053 */ int inmemorytablescan_localEnd = inmemorytablescan_numRows - inmemorytablescan_batchIdx; /* 054 */ for (int inmemorytablescan_localIdx = 0; inmemorytablescan_localIdx < inmemorytablescan_localEnd; inmemorytablescan_localIdx++) { /* 055 */ int inmemorytablescan_rowIdx = inmemorytablescan_batchIdx + inmemorytablescan_localIdx; /* 056 */ int inmemorytablescan_value = inmemorytablescan_colInstance0.getInt(inmemorytablescan_rowIdx); /* 057 */ /* 058 */ boolean filter_isNull = false; /* 059 */ /* 060 */ boolean filter_value = false; /* 061 */ filter_value = inmemorytablescan_value > 1; /* 062 */ if (!filter_value) continue; /* 063 */ /* 064 */ filter_numOutputRows.add(1); /* 065 */ /* 066 */ filter_rowWriter.write(0, inmemorytablescan_value); /* 067 */ append(filter_result); /* 068 */ if (shouldStop()) { inmemorytablescan_batchIdx = inmemorytablescan_rowIdx + 1; return; } /* 069 */ } /* 070 */ inmemorytablescan_batchIdx = inmemorytablescan_numRows; /* 071 */ inmemorytablescan_batch = null; /* 072 */ inmemorytablescan_nextBatch(); /* 073 */ } /* 074 */ inmemorytablescan_scanTime.add(inmemorytablescan_scanTime1 / (1000 * 1000)); /* 075 */ inmemorytablescan_scanTime1 = 0; /* 076 */ } /* 077 */ /* 078 */ private void inmemorytablescan_nextBatch() throws java.io.IOException { /* 079 */ long getBatchStart = System.nanoTime(); /* 080 */ if (inmemorytablescan_input.hasNext()) { /* 081 */ org.apache.spark.sql.execution.columnar.CachedBatch inmemorytablescan_cachedBatch = (org.apache.spark.sql.execution.columnar.CachedBatch)inmemorytablescan_input.next(); /* 082 */ inmemorytablescan_batch = org.apache.spark.sql.execution.columnar.InMemoryRelation$.MODULE$.createColumn(inmemorytablescan_cachedBatch); /* 083 */ /* 084 */ inmemorytablescan_numOutputRows.add(inmemorytablescan_batch.numRows()); /* 085 */ inmemorytablescan_batchIdx = 0; /* 086 */ inmemorytablescan_colInstance0 = (org.apache.spark.sql.execution.vectorized.OnHeapColumnVector) inmemorytablescan_batch.column(0); org.apache.spark.sql.execution.columnar.ColumnAccessor$.MODULE$.decompress(inmemorytablescan_cachedBatch.buffers()[0], (org.apache.spark.sql.execution.vectorized.WritableColumnVector) inmemorytablescan_colInstance0, org.apache.spark.sql.types.DataTypes.IntegerType, inmemorytablescan_cachedBatch.numRows()); /* 087 */ /* 088 */ } /* 089 */ inmemorytablescan_scanTime1 += System.nanoTime() - getBatchStart; /* 090 */ } /* 091 */ } ``` ## How was this patch tested? 
Add test cases into `DataFrameTungstenSuite` and `WholeStageCodegenSuite` Author: Kazuaki Ishizaki Closes #18747 from kiszk/SPARK-20822a. --- .../sql/execution/ColumnarBatchScan.scala | 3 - .../sql/execution/WholeStageCodegenExec.scala | 24 ++++---- .../execution/columnar/ColumnAccessor.scala | 8 +++ .../columnar/InMemoryTableScanExec.scala | 57 +++++++++++++++++-- .../spark/sql/DataFrameTungstenSuite.scala | 36 ++++++++++++ .../execution/WholeStageCodegenSuite.scala | 32 +++++++++++ 6 files changed, 141 insertions(+), 19 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 1afe83ea3539e..eb01e126bcbef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} -import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.sql.types.DataType @@ -31,8 +30,6 @@ import org.apache.spark.sql.types.DataType */ private[sql] trait ColumnarBatchScan extends CodegenSupport { - val inMemoryTableScan: InMemoryTableScanExec = null - def vectorTypes: Option[Seq[String]] = None override lazy val metrics = Map( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 1aaaf896692d1..e37d133ff336a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -282,6 +282,18 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp object WholeStageCodegenExec { val PIPELINE_DURATION_METRIC = "duration" + + private def numOfNestedFields(dataType: DataType): Int = dataType match { + case dt: StructType => dt.fields.map(f => numOfNestedFields(f.dataType)).sum + case m: MapType => numOfNestedFields(m.keyType) + numOfNestedFields(m.valueType) + case a: ArrayType => numOfNestedFields(a.elementType) + case u: UserDefinedType[_] => numOfNestedFields(u.sqlType) + case _ => 1 + } + + def isTooManyFields(conf: SQLConf, dataType: DataType): Boolean = { + numOfNestedFields(dataType) > conf.wholeStageMaxNumFields + } } /** @@ -490,22 +502,14 @@ case class CollapseCodegenStages(conf: SQLConf) extends Rule[SparkPlan] { case _ => true } - private def numOfNestedFields(dataType: DataType): Int = dataType match { - case dt: StructType => dt.fields.map(f => numOfNestedFields(f.dataType)).sum - case m: MapType => numOfNestedFields(m.keyType) + numOfNestedFields(m.valueType) - case a: ArrayType => numOfNestedFields(a.elementType) - case u: UserDefinedType[_] => numOfNestedFields(u.sqlType) - case _ => 1 - } - private def supportCodegen(plan: SparkPlan): Boolean = plan match { case plan: CodegenSupport if plan.supportCodegen => val willFallback = plan.expressions.exists(_.find(e => !supportCodegen(e)).isDefined) // the generated code will be huge if there are too many columns val hasTooManyOutputFields = - numOfNestedFields(plan.schema) > 
conf.wholeStageMaxNumFields + WholeStageCodegenExec.isTooManyFields(conf, plan.schema) val hasTooManyInputFields = - plan.children.map(p => numOfNestedFields(p.schema)).exists(_ > conf.wholeStageMaxNumFields) + plan.children.exists(p => WholeStageCodegenExec.isTooManyFields(conf, p.schema)) !willFallback && !hasTooManyOutputFields && !hasTooManyInputFields case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala index 24c8ac81420cb..445933d98e9d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala @@ -163,4 +163,12 @@ private[sql] object ColumnAccessor { throw new RuntimeException("Not support non-primitive type now") } } + + def decompress( + array: Array[Byte], columnVector: WritableColumnVector, dataType: DataType, numRows: Int): + Unit = { + val byteBuffer = ByteBuffer.wrap(array) + val columnAccessor = ColumnAccessor(dataType, byteBuffer) + decompress(columnAccessor, columnVector, numRows) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 139da1c519da2..43386e7a03c32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -23,21 +23,66 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} -import org.apache.spark.sql.execution.LeafExecNode -import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.types.UserDefinedType +import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} +import org.apache.spark.sql.execution.vectorized._ +import org.apache.spark.sql.types._ case class InMemoryTableScanExec( attributes: Seq[Attribute], predicates: Seq[Expression], @transient relation: InMemoryRelation) - extends LeafExecNode { + extends LeafExecNode with ColumnarBatchScan { override protected def innerChildren: Seq[QueryPlan[_]] = Seq(relation) ++ super.innerChildren - override lazy val metrics = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + override def vectorTypes: Option[Seq[String]] = + Option(Seq.fill(attributes.length)(classOf[OnHeapColumnVector].getName)) + + /** + * If true, get data from ColumnVector in ColumnarBatch, which are generally faster. 
+ * If false, get data from UnsafeRow build from ColumnVector + */ + override val supportCodegen: Boolean = { + // In the initial implementation, for ease of review + // support only primitive data types and # of fields is less than wholeStageMaxNumFields + relation.schema.fields.forall(f => f.dataType match { + case BooleanType | ByteType | ShortType | IntegerType | LongType | + FloatType | DoubleType => true + case _ => false + }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema) + } + + private val columnIndices = + attributes.map(a => relation.output.map(o => o.exprId).indexOf(a.exprId)).toArray + + private val relationSchema = relation.schema.toArray + + private lazy val columnarBatchSchema = new StructType(columnIndices.map(i => relationSchema(i))) + + private def createAndDecompressColumn(cachedColumnarBatch: CachedBatch): ColumnarBatch = { + val rowCount = cachedColumnarBatch.numRows + val columnVectors = OnHeapColumnVector.allocateColumns(rowCount, columnarBatchSchema) + val columnarBatch = new ColumnarBatch( + columnarBatchSchema, columnVectors.asInstanceOf[Array[ColumnVector]], rowCount) + columnarBatch.setNumRows(rowCount) + + for (i <- 0 until attributes.length) { + ColumnAccessor.decompress( + cachedColumnarBatch.buffers(columnIndices(i)), + columnarBatch.column(i).asInstanceOf[WritableColumnVector], + columnarBatchSchema.fields(i).dataType, rowCount) + } + columnarBatch + } + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + assert(supportCodegen) + val buffers = relation.cachedColumnBuffers + // HACK ALERT: This is actually an RDD[ColumnarBatch]. + // We're taking advantage of Scala's type erasure here to pass these batches along. + Seq(buffers.map(createAndDecompressColumn(_)).asInstanceOf[RDD[InternalRow]]) + } override def output: Seq[Attribute] = attributes diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala index fe6ba83b4cbfb..0881212a64de8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala @@ -73,4 +73,40 @@ class DataFrameTungstenSuite extends QueryTest with SharedSQLContext { val df = spark.createDataFrame(data, schema) assert(df.select("b").first() === Row(outerStruct)) } + + test("primitive data type accesses in persist data") { + val data = Seq(true, 1.toByte, 3.toShort, 7, 15.toLong, + 31.25.toFloat, 63.75, null) + val dataTypes = Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, IntegerType) + val schemas = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, true) + } + val rdd = sparkContext.makeRDD(Seq(Row.fromSeq(data))) + val df = spark.createDataFrame(rdd, StructType(schemas)) + val row = df.persist.take(1).apply(0) + checkAnswer(df, row) + } + + test("access cache multiple times") { + val df0 = sparkContext.parallelize(Seq(1, 2, 3), 1).toDF("x").cache + df0.count + val df1 = df0.filter("x > 1") + checkAnswer(df1, Seq(Row(2), Row(3))) + val df2 = df0.filter("x > 2") + checkAnswer(df2, Row(3)) + + val df10 = sparkContext.parallelize(Seq(3, 4, 5, 6), 1).toDF("x").cache + for (_ <- 0 to 2) { + val df11 = df10.filter("x > 5") + checkAnswer(df11, Row(6)) + } + } + + test("access only some column of the all of columns") { + val df = spark.range(1, 10).map(i => (i, (i + 1).toDouble)).toDF("l", "d") + df.cache + df.count + assert(df.filter("d < 
3").count == 1) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 098e4cfeb15b2..bc05dca578c47 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec +import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.expressions.scalalang.typed @@ -117,6 +118,37 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } + test("cache for primitive type should be in WholeStageCodegen with InMemoryTableScanExec") { + import testImplicits._ + + val dsInt = spark.range(3).cache + dsInt.count + val dsIntFilter = dsInt.filter(_ > 0) + val planInt = dsIntFilter.queryExecution.executedPlan + assert(planInt.find(p => + p.isInstanceOf[WholeStageCodegenExec] && + p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && + p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child + .isInstanceOf[InMemoryTableScanExec] && + p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child + .asInstanceOf[InMemoryTableScanExec].supportCodegen).isDefined + ) + assert(dsIntFilter.collect() === Array(1, 2)) + + // cache for string type is not supported for InMemoryTableScanExec + val dsString = spark.range(3).map(_.toString).cache + dsString.count + val dsStringFilter = dsString.filter(_ == "1") + val planString = dsStringFilter.queryExecution.executedPlan + assert(planString.find(p => + p.isInstanceOf[WholeStageCodegenExec] && + p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && + !p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child + .isInstanceOf[InMemoryTableScanExec]).isDefined + ) + assert(dsStringFilter.collect() === Array("1")) + } + test("SPARK-19512 codegen for comparing structs is incorrect") { // this would raise CompileException before the fix spark.range(10) From 8beeaed66bde0ace44495b38dc967816e16b3464 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 24 Oct 2017 13:56:10 +0100 Subject: [PATCH 753/779] [SPARK-21936][SQL][FOLLOW-UP] backward compatibility test framework for HiveExternalCatalog ## What changes were proposed in this pull request? Adjust Spark download in test to use Apache mirrors and respect its load balancer, and use Spark 2.1.2. This follows on a recent PMC list thread about removing the cloudfront download rather than update it further. ## How was this patch tested? Existing tests. Author: Sean Owen Closes #19564 from srowen/SPARK-21936.2. 
--- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 305f5b533d592..5f8c9d5799662 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -53,7 +53,9 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private def downloadSpark(version: String): Unit = { import scala.sys.process._ - val url = s"https://d3kbcqa49mib13.cloudfront.net/spark-$version-bin-hadoop2.7.tgz" + val preferredMirror = + Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim + val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" Seq("wget", url, "-q", "-P", sparkTestingDir.getCanonicalPath).! @@ -142,7 +144,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.0.2", "2.1.1", "2.2.0") + val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0") protected var spark: SparkSession = _ From 3f5ba968c5af7911a2f6c452500b6a629a3de8db Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 24 Oct 2017 09:11:52 -0700 Subject: [PATCH 754/779] [SPARK-22301][SQL] Add rule to Optimizer for In with not-nullable value and empty list ## What changes were proposed in this pull request? For performance reason, we should resolve in operation on an empty list as false in the optimizations phase, ad discussed in #19522. ## How was this patch tested? Added UT cc gatorsmile Author: Marco Gaido Author: Marco Gaido Closes #19523 from mgaido91/SPARK-22301. --- .../sql/catalyst/optimizer/expressions.scala | 7 +++++-- .../sql/catalyst/optimizer/OptimizeInSuite.scala | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 273bc6ce27c5d..523b53b39d6b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -169,13 +169,16 @@ object ReorderAssociativeOperator extends Rule[LogicalPlan] { /** * Optimize IN predicates: - * 1. Removes literal repetitions. - * 2. Replaces [[In (value, seq[Literal])]] with optimized version + * 1. Converts the predicate to false when the list is empty and + * the value is not nullable. + * 2. Removes literal repetitions. + * 3. Replaces [[In (value, seq[Literal])]] with optimized version * [[InSet (value, HashSet[Literal])]] which is much faster. 
*/ object OptimizeIn extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsDown { + case In(v, list) if list.isEmpty && !v.nullable => FalseLiteral case expr @ In(v, list) if expr.inSetConvertible => val newList = ExpressionSet(list).toSeq if (newList.size > SQLConf.get.optimizerInSetConversionThreshold) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index eaad1e32a8aba..d7acd139225cd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -175,4 +175,20 @@ class OptimizeInSuite extends PlanTest { } } } + + test("OptimizedIn test: In empty list gets transformed to FalseLiteral " + + "when value is not nullable") { + val originalQuery = + testRelation + .where(In(Literal("a"), Nil)) + .analyze + + val optimized = Optimize.execute(originalQuery) + val correctAnswer = + testRelation + .where(Literal(false)) + .analyze + + comparePlans(optimized, correctAnswer) + } } From bc1e76632ddec8fc64726086905183d1f312bca4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Oct 2017 06:33:44 +0100 Subject: [PATCH 755/779] [SPARK-22348][SQL] The table cache providing ColumnarBatch should also do partition batch pruning ## What changes were proposed in this pull request? We enable table cache `InMemoryTableScanExec` to provide `ColumnarBatch` now. But the cached batches are retrieved without pruning. In this case, we still need to do partition batch pruning. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #19569 from viirya/SPARK-22348. --- .../columnar/InMemoryTableScanExec.scala | 70 ++++++++++--------- .../columnar/InMemoryColumnarQuerySuite.scala | 27 ++++++- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 43386e7a03c32..2ae3f35eb1da1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -78,7 +78,7 @@ case class InMemoryTableScanExec( override def inputRDDs(): Seq[RDD[InternalRow]] = { assert(supportCodegen) - val buffers = relation.cachedColumnBuffers + val buffers = filteredCachedBatches() // HACK ALERT: This is actually an RDD[ColumnarBatch]. // We're taking advantage of Scala's type erasure here to pass these batches along. Seq(buffers.map(createAndDecompressColumn(_)).asInstanceOf[RDD[InternalRow]]) @@ -180,19 +180,11 @@ case class InMemoryTableScanExec( private val inMemoryPartitionPruningEnabled = sqlContext.conf.inMemoryPartitionPruning - protected override def doExecute(): RDD[InternalRow] = { - val numOutputRows = longMetric("numOutputRows") - - if (enableAccumulators) { - readPartitions.setValue(0) - readBatches.setValue(0) - } - + private def filteredCachedBatches(): RDD[CachedBatch] = { // Using these variables here to avoid serialization of entire objects (if referenced directly) // within the map Partitions closure. 
val schema = relation.partitionStatistics.schema val schemaIndex = schema.zipWithIndex - val relOutput: AttributeSeq = relation.output val buffers = relation.cachedColumnBuffers buffers.mapPartitionsWithIndexInternal { (index, cachedBatchIterator) => @@ -201,35 +193,49 @@ case class InMemoryTableScanExec( schema) partitionFilter.initialize(index) + // Do partition batch pruning if enabled + if (inMemoryPartitionPruningEnabled) { + cachedBatchIterator.filter { cachedBatch => + if (!partitionFilter.eval(cachedBatch.stats)) { + logDebug { + val statsString = schemaIndex.map { case (a, i) => + val value = cachedBatch.stats.get(i, a.dataType) + s"${a.name}: $value" + }.mkString(", ") + s"Skipping partition based on stats $statsString" + } + false + } else { + true + } + } + } else { + cachedBatchIterator + } + } + } + + protected override def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + + if (enableAccumulators) { + readPartitions.setValue(0) + readBatches.setValue(0) + } + + // Using these variables here to avoid serialization of entire objects (if referenced directly) + // within the map Partitions closure. + val relOutput: AttributeSeq = relation.output + + filteredCachedBatches().mapPartitionsInternal { cachedBatchIterator => // Find the ordinals and data types of the requested columns. val (requestedColumnIndices, requestedColumnDataTypes) = attributes.map { a => relOutput.indexOf(a.exprId) -> a.dataType }.unzip - // Do partition batch pruning if enabled - val cachedBatchesToScan = - if (inMemoryPartitionPruningEnabled) { - cachedBatchIterator.filter { cachedBatch => - if (!partitionFilter.eval(cachedBatch.stats)) { - logDebug { - val statsString = schemaIndex.map { case (a, i) => - val value = cachedBatch.stats.get(i, a.dataType) - s"${a.name}: $value" - }.mkString(", ") - s"Skipping partition based on stats $statsString" - } - false - } else { - true - } - } - } else { - cachedBatchIterator - } - // update SQL metrics - val withMetrics = cachedBatchesToScan.map { batch => + val withMetrics = cachedBatchIterator.map { batch => if (enableAccumulators) { readBatches.add(1) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 2f249c850a088..e662e294228db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -23,7 +23,7 @@ import java.sql.{Date, Timestamp} import org.apache.spark.sql.{DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, In} import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.execution.LocalTableScanExec +import org.apache.spark.sql.execution.{FilterExec, LocalTableScanExec, WholeStageCodegenExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -454,4 +454,29 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { Seq(In(attribute, Nil)), testRelation) assert(tableScanExec.partitionFilters.isEmpty) } + + test("SPARK-22348: table cache should do partition batch pruning") { + Seq("true", "false").foreach { enabled => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> enabled) { + val df1 = Seq((1, 1), 
(1, 1), (2, 2)).toDF("x", "y") + df1.unpersist() + df1.cache() + + // Push predicate to the cached table. + val df2 = df1.where("y = 3") + + val planBeforeFilter = df2.queryExecution.executedPlan.collect { + case f: FilterExec => f.child + } + assert(planBeforeFilter.head.isInstanceOf[InMemoryTableScanExec]) + + val execPlan = if (enabled == "true") { + WholeStageCodegenExec(planBeforeFilter.head) + } else { + planBeforeFilter.head + } + assert(execPlan.executeCollectPublic().length == 0) + } + } + } } From 524abb996abc9970d699623c13469ea3b6d2d3fc Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 24 Oct 2017 22:59:46 -0700 Subject: [PATCH 756/779] [SPARK-21101][SQL] Catch IllegalStateException when CREATE TEMPORARY FUNCTION ## What changes were proposed in this pull request? It must `override` [`public StructObjectInspector initialize(ObjectInspector[] argOIs)`](https://github.com/apache/hive/blob/release-2.0.0/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTF.java#L70) when create a UDTF. If you `override` [`public StructObjectInspector initialize(StructObjectInspector argOIs)`](https://github.com/apache/hive/blob/release-2.0.0/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTF.java#L49), `IllegalStateException` will throw. per: [HIVE-12377](https://issues.apache.org/jira/browse/HIVE-12377). This PR catch `IllegalStateException` and point user to `override` `public StructObjectInspector initialize(ObjectInspector[] argOIs)`. ## How was this patch tested? unit tests Source code and binary jar: [SPARK-21101.zip](https://github.com/apache/spark/files/1123763/SPARK-21101.zip) These two source code copy from : https://github.com/apache/hive/blob/release-2.0.0/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFStack.java Author: Yuming Wang Closes #18527 from wangyum/SPARK-21101. --- .../spark/sql/hive/HiveSessionCatalog.scala | 11 ++++++-- .../src/test/resources/SPARK-21101-1.0.jar | Bin 0 -> 7439 bytes .../sql/hive/execution/SQLQuerySuite.scala | 26 ++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 sql/hive/src/test/resources/SPARK-21101-1.0.jar diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index b256ffc27b199..1f11adbd4f62e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -94,8 +94,15 @@ private[sql] class HiveSessionCatalog( } } catch { case NonFatal(e) => - val analysisException = - new AnalysisException(s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}': $e") + val noHandlerMsg = s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}': $e" + val errorMsg = + if (classOf[GenericUDTF].isAssignableFrom(clazz)) { + s"$noHandlerMsg\nPlease make sure your function overrides " + + "`public StructObjectInspector initialize(ObjectInspector[] args)`." 
+ } else { + noHandlerMsg + } + val analysisException = new AnalysisException(errorMsg) analysisException.setStackTrace(e.getStackTrace) throw analysisException } diff --git a/sql/hive/src/test/resources/SPARK-21101-1.0.jar b/sql/hive/src/test/resources/SPARK-21101-1.0.jar new file mode 100644 index 0000000000000000000000000000000000000000..768b2334db5c3aa8b1e4186af5fde86120d3231b GIT binary patch literal 7439 zcmb7J1z1#D*XAOFgmkw^#~@uINJ=*%LyqJK2t#+*05TE|HH4DV-Q7rsgi4oO8m=&+ zzz3gKz2EQkKKEbq%sJB?WmMT^==sBelT+RTUu57@i7{b9iv1 zQk!>DU~$cfTY0#TTLmbCb$vDaK>|5f8?#3}GD@37MO()ujkB1P7MD0)K%2~mWI+4q z@{Y2AvvS+A{HISWgnp4F`pUw6j<|P&J)E6QcuaWEzJ>L3^ca_6IXGE=5Bz5j+&?|Q zj$m^e%YSer`d>$9N3fIaKe&_qox3yC?jIo3zk=96-2N#t=6}RldRUsfxk;BBJ-@-KWJ^N#rinTf~4%X6gKbOjmDORleAB(l_w?F9>Q9~XJ@2E!6-9*Gl0>o zg@v=KaIdBFt>}1BJs8`8E}_Q2xeK9bsT^54w)1BiX(aZN9c3kyC&D@yWku->4&O&^ zx9Y)^B^Wwt*H%N72iv2hn@APtT1YzFJ5lr|h@q2U6qqs!tfFKRI|LCs)54NMP?+K^ zu`wbEma1?1ak6-@gkSwBZR>tVemCtCEIn{*|?j8tz4td~>TBI^Z{YnJb7)X8!xb6LfD8)fHC)(G&T}nuonup|g_zYt-?nkFHOP|`O zfvKM~21pUp5n{Q3u)SV+6@q`in;hHO&@4+#oIm@xyLvLfipDA3^hTX5Cdchm_K5hY z?WH8yyLWv*l6WC=%LiwMpWbOWH3mw)SFW)FO?fJUa? zRDl4`LpMl4mmk}`%&^1*T;;c5j2YI} z)X#G#nWwHePY{!4PmxK(i6CAt-Lw`0flCv%;yudCGi^^SMjw&ieURn7*2~|i`hg9@ z2b1piI~_@-C-X3J&@AVT_nuN_zFXMTr@cFy9L`a|ocY4Ajo_Fv|=$ z#qki151<*P>H+{-J83ZTf)-ZP0&5x%VL`R~HEm4pw4ylY0Z}-|;MTUt+htq>vf*ii z(I0XR2NTg%KD)cPNRq<}PO+8dwS@H5Hd9s#cA+Y74k?i2kP;PDLjjWNcpi75Nv62#&m!$q!VL|RtgL`K$BiZI;7HSewdKMiDEJ$;k)r{wfZ#+6s6d{u+OK)Ur7xt^9f^}8S74LE8dI#P=XjW0@j4$ zl4wfUIsu74G+j4Hhc{;Lx@BvVpqFScSl}1JJ=DpFBg3 z7{lL_#?Zwkljpz6c_qv*WI0e}Qj#;#kW6TkzT1|N<0JQ41hf13^VVLKZd_3=Ic3I{ z>rMuqPSuqDA{w3o6*oL#P41eA7fK4${(522x0dwWNtIlJ5sC?v*_s>2<2RE9+vN^v ztWFh>hXpsQ8~VKxsG%#y&=2Dl-f{g=Q`U~j*nS1?(W@V1lOl7DlBI&4bUtf%!%HC` zQa=}bx53BIMqAsLx^Us+iaoUp;E3T#?fwy(l%W28DiwPtmB9?-KH0DjigTmrfHou6 zvZd}^sHO<@%%A(zZlJz*8vPy5aMr*>a!VGwG+NKMM{B=dJ_ ztMX@QxDmJ+7nLu>qJCGQ--H@Z#jT3}J|5;(rIRHLf;O{_EALJ&g6(=&nxtzymsV}a zPT4_JzM~p<(MzPFXyHu8%AJ-MjAQIipu$djgXfB_kO9bY3IQ6H=jh|3Liy zn)yT3V*;y#Q67J9-}8O;;i(Nc-Wxg`+BhCalVoIelU8DP9L>Y`4{2=g za=02VYLcti-qL1_>D7y@c!fuLqL^9thPPx#Olr=1842IHCu4k#1QJ!m?m>eS)Cw2fWfI2RaRPPTyA>(kG(nou=B-+B>8VYe`h<$#uM)lD;;N_~`b~|8ujmd!_gZuLVfkdGH5}IT$ zV>woe4WrA0N3{=wrON?I<#e*k*vHTtQQ+*5U?M1Ht_L2XuJ6EdhCb#bE9j3fCADP~ z-l2F~EW_<3d)n1LHu@a9>rG>PR`6?LO;P3O^J3z1cDPQn_)o`{nM#48gIf4|M|E-U zq9{&?-@n`G_IQ^tSt)154eS_(uj2Dx0V8zbbp6?b8r$Va;##99*F+$ceTeTdgC)!PrGp^_^8=^;_pQDV%mqvI-l zg^t>#M~)>=dn)U63zPKuv&+K@R2qlwe^R49VkJFqfTTH%Jkd*k`q~qkYR%$DzB=J0 z;GHacCRCiLti63iqFp34G6ZLu=eQN#%q-NAyxAu6wp~m78kVywpI@7g{(l)?2V+oaKej1+S+9we<#yOXLqZb~ORj@l zX6V5XZwm|IIPwq8ak7WgQdvW`Fr>8=@mcQH4p}}8-vnZPjaGIM7P;hmRuivoV*nJg zXU-LxtEq%RoaYtm!m?-@iH}#KV-HmahAJx3x}Gg)AWpp->-}9`yA9jK4D$uzvEfql zr#$BwUQhTKBYifd=``cGGuX652$>*ezZ;X42JEn>;~8u=)`7eTh^yl6+#Iqgv2{#U zt+wB_nc7snm8Zcnj#V5AFS< zsy|u_Id5w%>f1@&>KGHd9wt%in``MchaULQ$LQva?UqDN^;xdvjrDa_JZs%2%%d|7 zc~ygT>q!-+q?pZ+_&aO}bZBLYK&m4o&!M#3Ec*lXJLPCP>K&Scs!8tZsa*b=0lb&t z6xs?8@EgHR{Vf>gkExCf@LP!Q)GIr6>DT#nIXiKbI#F~80p}hP5-J5!PgwBD<^q&F zLH=gd>@0z5Oj!hvo#Ql!T333LDJ6C@#$9=v56L`9paN zXy}B{tD4>lJB=eM=22*3e!QG2E&uxj^V5*5mznbLBNyZPyIY>Xc=Of`-Gv#my;n(Z zUgT1W85og106p&HW852*`0`N<>-+OQBYgBTY%a}Q@wbKnIQyJ&>?v6oZ)A;Fbu|xDX3y25n zX|J>5>1DCudaLNaw#)7rugT*?;kOhndW$NtI`)0Mp#~>8*AacS_!$)#a0>C%XeT&8 z+|$poRrMQ)B~m@{bG6o|q#$j2%D{o*o5xvULua}OXD(UHLnULNoB2jG1xD6ro#kgp3_HpoB8QGXM<03M2k2VPu7OY{btVwLN+&$ zFF2BU;|~f%T)McqJW>A3Odo92Z3(RHN2QnqdqJyAJILOwgc*ieeVcCk&w9L6*M$5<} z#PnHn)|WVsqB8j*U`hU3w(ZWe?d_LK)W53>DwJlPkL&~*`v*GxX0PpnhbL&Jw(?eD z3cAQSg*B%t 
zGcwZ{$F-V-{pMTpO`Q)uP^Y{)PnQ72TyHRpiodO=C8Gq)6c=$qN7bW`t#Ys8CK@?QUPv@LUYV8SM#lY8kF zK75w5*G^C)A2%t}2WR9!Lc@90bK#^BTS8rm%%_#S*B5@>CP`<%POxy*x+W~iK4h1uxFTnlXz4`g;+B)A9&BC&p=~^ zYHCZ=EwUM#f|}Jhd*4$zD?xZgWzfg<0`P60@^f{K;Buexy?SU5cC-9Zh5Ff<{AXu# z2Ya5!7T+sZKVxqF5~HIjt*rv&=i}q$`;Ny#dvIB|^zto}eT$BU=5kr_Fm{;WCH37ajo>sf7umUIbI8w#LHvls9MDxr(fF!yGR7`%Cj_;JVp5M61kbIU z>SlQFejKW2z;2MVi?X?&^C-wvjf6Kzp$X7*^W7$h6xu%XBRIzGO&zhq{^ILhDHvl&D^Mc1 zPL-zqa$g`O@wI`2Tv+wM?yO-JUyn2qK~K!}LC=(wfzbzqpf88;3~`zuy1_Unj)Fs6 zC4q7PO^}(WAq^s>*fDnq>UpwsZY!Y?XAoqDTU!NM5-g|K29>MDz(qwK=h1KzNzfO^ zt^m~%SrG8C!_R8EvCkuSeOL%xdSFqE@=8O#x;g4?n!tU{@*p%8&g3x-S3MnlUFV4l z(K+5YL`TSm7~yjsuDj%q6E-^S_wj?X2z$zdJ;td?7++%|gWb)@tu4;TjRF8ccb#`S z-Kxk$H@&K!J^;=<$DJ)An~JuVmPSRnO96_93;3_t!$S}D?mY|sPJc1Ilivb{jk-W(9_T~hN< zTp`lzjbJm=P(A}3Eh0{+-B8}LPtnlE){4^2E`Zf`3X~snre3odX9v_)K7a+}V6%Ex zq27Y`#RVgcv50O$1YG+G`oS(6A!~cx^AVD4P!Q^Wr`o6^v+2`BT2*UWF&Vu#`Mjp>jiMM9Z74wRyi>*SEKsC&Yy$ zS6`M7zLIo(b>N5ZCL3v+L|j=WMGHc#a}| zor^K;%e8Ma`0JO3S*gU@qsxh5arx){*Av9i$-&Xm$pvERtmCFJqykjtR#MYaEYRIo z<5n43(Z;`i*re=CdP9}$23}D>-=t={n5bNPU)w0Mkh?lxR6uHEN^>NDtC_vEs!gC< zSWHU)HdiQDc|NlMkgpztotSo!a<$mj!p6p|4{%xW{|%k=+OHLh%b+if5N${L>im5L z`T_fU1>?&3%CZC5E|1?~f6+63F#Nn0U5O1}?W*CH%ge9Q!>>cV5+A)Vy!gSB z^^ZJ%l`_6D{=Jy-jWauN?O!tfUuomVK>z*M_m373m!tTXVf)(qSKasDXa5oN_ZGf> zi~nfg=QrspZGJUEw2KSLOL6V{yZ9}MetZ6(!B=VY2iW#!@b{$pYmTe5`hz3mmmL3_ zWWVwKeVYBoxD&YWulW8 true, "udtf_stack2" -> true) { + sql( + s""" + |CREATE TEMPORARY FUNCTION udtf_stack1 + |AS 'org.apache.spark.sql.hive.execution.UDTFStack' + |USING JAR '${hiveContext.getHiveFile("SPARK-21101-1.0.jar").toURI}' + """.stripMargin) + val cnt = + sql("SELECT udtf_stack1(2, 'A', 10, date '2015-01-01', 'B', 20, date '2016-01-01')").count() + assert(cnt === 2) + + sql( + s""" + |CREATE TEMPORARY FUNCTION udtf_stack2 + |AS 'org.apache.spark.sql.hive.execution.UDTFStack2' + |USING JAR '${hiveContext.getHiveFile("SPARK-21101-1.0.jar").toURI}' + """.stripMargin) + val e = intercept[org.apache.spark.sql.AnalysisException] { + sql("SELECT udtf_stack2(2, 'A', 10, date '2015-01-01', 'B', 20, date '2016-01-01')") + } + assert( + e.getMessage.contains("public StructObjectInspector initialize(ObjectInspector[] args)")) + } + } + test("SPARK-21721: Clear FileSystem deleterOnExit cache if path is successfully removed") { val table = "test21721" withTable(table) { From 427359f077ad469d78c97972d021535f30a1e418 Mon Sep 17 00:00:00 2001 From: Ruben Berenguel Montoro Date: Tue, 24 Oct 2017 23:02:11 -0700 Subject: [PATCH 757/779] [SPARK-13947][SQL] The error message from using an invalid column reference is not clear ## What changes were proposed in this pull request? Rewritten error message for clarity. Added extra information in case of attribute name collision, hinting the user to double-check referencing two different tables ## How was this patch tested? No functional changes, only final message has changed. It has been tested manually against the situation proposed in the JIRA ticket. Automated tests in repository pass. This PR is original work from me and I license this work to the Spark project Author: Ruben Berenguel Montoro Author: Ruben Berenguel Montoro Author: Ruben Berenguel Closes #17100 from rberenguel/SPARK-13947-error-message. 
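A hypothetical spark-shell example (not taken from the patch) of the situation the extra hint targets: after a join on a common column, only one copy of that column survives, so selecting it through the other DataFrame references an attribute that is no longer in the plan's output even though an attribute with the same name is:

```
// Hypothetical reproduction; names and data are made up, implicits as in spark-shell.
val df1 = Seq((1, "a")).toDF("id", "v1")
val df2 = Seq((1, "b")).toDF("id", "v2")
val joined = df1.join(df2, "id")   // output keeps a single "id" column

// df2("id") refers to an attribute the joined plan no longer outputs, so analysis fails
// with a "Resolved attribute(s) ... missing from ..." error; with this change the message
// also notes that an attribute with the same name ("id") appears in the operation.
joined.select(df2("id"))
```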
--- .../sql/catalyst/analysis/CheckAnalysis.scala | 19 ++++++++++++--- .../analysis/AnalysisErrorSuite.scala | 23 +++++++++++++------ .../invalid-correlation.sql.out | 2 +- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index d9906bb6e6ede..b5e8bdd79869e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -272,10 +272,23 @@ trait CheckAnalysis extends PredicateHelper { case o if o.children.nonEmpty && o.missingInput.nonEmpty => val missingAttributes = o.missingInput.mkString(",") val input = o.inputSet.mkString(",") + val msgForMissingAttributes = s"Resolved attribute(s) $missingAttributes missing " + + s"from $input in operator ${operator.simpleString}." - failAnalysis( - s"resolved attribute(s) $missingAttributes missing from $input " + - s"in operator ${operator.simpleString}") + val resolver = plan.conf.resolver + val attrsWithSameName = o.missingInput.filter { missing => + o.inputSet.exists(input => resolver(missing.name, input.name)) + } + + val msg = if (attrsWithSameName.nonEmpty) { + val sameNames = attrsWithSameName.map(_.name).mkString(",") + s"$msgForMissingAttributes Attribute(s) with the same name appear in the " + + s"operation: $sameNames. Please check if the right attribute(s) are used." + } else { + msgForMissingAttributes + } + + failAnalysis(msg) case p @ Project(exprs, _) if containsMultipleGenerators(exprs) => failAnalysis( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 884e113537c93..5d2f8e735e3d4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -408,16 +408,25 @@ class AnalysisErrorSuite extends AnalysisTest { // CheckAnalysis should throw AnalysisException when Aggregate contains missing attribute(s) // Since we manually construct the logical plan at here and Sum only accept // LongType, DoubleType, and DecimalType. We use LongType as the type of a. - val plan = - Aggregate( - Nil, - Alias(sum(AttributeReference("a", LongType)(exprId = ExprId(1))), "b")() :: Nil, - LocalRelation( - AttributeReference("a", LongType)(exprId = ExprId(2)))) + val attrA = AttributeReference("a", LongType)(exprId = ExprId(1)) + val otherA = AttributeReference("a", LongType)(exprId = ExprId(2)) + val attrC = AttributeReference("c", LongType)(exprId = ExprId(3)) + val aliases = Alias(sum(attrA), "b")() :: Alias(sum(attrC), "d")() :: Nil + val plan = Aggregate( + Nil, + aliases, + LocalRelation(otherA)) assert(plan.resolved) - assertAnalysisError(plan, "resolved attribute(s) a#1L missing from a#2L" :: Nil) + val resolved = s"${attrA.toString},${attrC.toString}" + + val errorMsg = s"Resolved attribute(s) $resolved missing from ${otherA.toString} " + + s"in operator !Aggregate [${aliases.mkString(", ")}]. " + + s"Attribute(s) with the same name appear in the operation: a. " + + "Please check if the right attribute(s) are used." 
+ + assertAnalysisError(plan, errorMsg :: Nil) } test("error test for self-join") { diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index e4b1a2dbc675c..2586f26f71c35 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -63,7 +63,7 @@ WHERE t1a IN (SELECT min(t2a) struct<> -- !query 4 output org.apache.spark.sql.AnalysisException -resolved attribute(s) t2b#x missing from min(t2a)#x,t2c#x in operator !Filter t2c#x IN (list#x [t2b#x]); +Resolved attribute(s) t2b#x missing from min(t2a)#x,t2c#x in operator !Filter t2c#x IN (list#x [t2b#x]).; -- !query 5 From 6c6950839da991bd41accdb8fb03fbc3b588c1e4 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 25 Oct 2017 12:51:20 +0100 Subject: [PATCH 758/779] [SPARK-22322][CORE] Update FutureAction for compatibility with Scala 2.12 Future ## What changes were proposed in this pull request? Scala 2.12's `Future` defines two new methods to implement, `transform` and `transformWith`. These can be implemented naturally in Spark's `FutureAction` extension and subclasses, but, only in terms of the new methods that don't exist in Scala 2.11. To support both at the same time, reflection is used to implement these. ## How was this patch tested? Existing tests. Author: Sean Owen Closes #19561 from srowen/SPARK-22322. --- .../scala/org/apache/spark/FutureAction.scala | 59 ++++++++++++++++++- pom.xml | 2 +- .../FlatMapGroupsWithStateSuite.scala | 3 +- .../sql/streaming/StreamingQuerySuite.scala | 2 +- .../util/FileBasedWriteAheadLog.scala | 2 +- 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/FutureAction.scala b/core/src/main/scala/org/apache/spark/FutureAction.scala index 1034fdcae8e8c..036c9a60630ea 100644 --- a/core/src/main/scala/org/apache/spark/FutureAction.scala +++ b/core/src/main/scala/org/apache/spark/FutureAction.scala @@ -89,7 +89,11 @@ trait FutureAction[T] extends Future[T] { */ override def value: Option[Try[T]] - // These two methods must be implemented in Scala 2.12, but won't be used by Spark + // These two methods must be implemented in Scala 2.12. They're implemented as a no-op here + // and then filled in with a real implementation in the two subclasses below. The no-op exists + // here so that those implementations can declare "override", necessary in 2.12, while working + // in 2.11, where the method doesn't exist in the superclass. + // After 2.11 support goes away, remove these two: def transform[S](f: (Try[T]) => Try[S])(implicit executor: ExecutionContext): Future[S] = throw new UnsupportedOperationException() @@ -113,6 +117,42 @@ trait FutureAction[T] extends Future[T] { } +/** + * Scala 2.12 defines the two new transform/transformWith methods mentioned above. Impementing + * these for 2.12 in the Spark class here requires delegating to these same methods in an + * underlying Future object. But that only exists in 2.12. But these methods are only called + * in 2.12. So define helper shims to access these methods on a Future by reflection. 
+ */ +private[spark] object FutureAction { + + private val transformTryMethod = + try { + classOf[Future[_]].getMethod("transform", classOf[(_) => _], classOf[ExecutionContext]) + } catch { + case _: NoSuchMethodException => null // Would fail later in 2.11, but not called in 2.11 + } + + private val transformWithTryMethod = + try { + classOf[Future[_]].getMethod("transformWith", classOf[(_) => _], classOf[ExecutionContext]) + } catch { + case _: NoSuchMethodException => null // Would fail later in 2.11, but not called in 2.11 + } + + private[spark] def transform[T, S]( + future: Future[T], + f: (Try[T]) => Try[S], + executor: ExecutionContext): Future[S] = + transformTryMethod.invoke(future, f, executor).asInstanceOf[Future[S]] + + private[spark] def transformWith[T, S]( + future: Future[T], + f: (Try[T]) => Future[S], + executor: ExecutionContext): Future[S] = + transformWithTryMethod.invoke(future, f, executor).asInstanceOf[Future[S]] + +} + /** * A [[FutureAction]] holding the result of an action that triggers a single job. Examples include @@ -153,6 +193,18 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc: jobWaiter.completionFuture.value.map {res => res.map(_ => resultFunc)} def jobIds: Seq[Int] = Seq(jobWaiter.jobId) + + override def transform[S](f: (Try[T]) => Try[S])(implicit e: ExecutionContext): Future[S] = + FutureAction.transform( + jobWaiter.completionFuture, + (u: Try[Unit]) => f(u.map(_ => resultFunc)), + e) + + override def transformWith[S](f: (Try[T]) => Future[S])(implicit e: ExecutionContext): Future[S] = + FutureAction.transformWith( + jobWaiter.completionFuture, + (u: Try[Unit]) => f(u.map(_ => resultFunc)), + e) } @@ -246,6 +298,11 @@ class ComplexFutureAction[T](run : JobSubmitter => Future[T]) def jobIds: Seq[Int] = subActions.flatMap(_.jobIds) + override def transform[S](f: (Try[T]) => Try[S])(implicit e: ExecutionContext): Future[S] = + FutureAction.transform(p.future, f, e) + + override def transformWith[S](f: (Try[T]) => Future[S])(implicit e: ExecutionContext): Future[S] = + FutureAction.transformWith(p.future, f, e) } diff --git a/pom.xml b/pom.xml index b9c972855204a..2d59f06811a82 100644 --- a/pom.xml +++ b/pom.xml @@ -2692,7 +2692,7 @@ scala-2.12 - 2.12.3 + 2.12.4 2.12 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index af08186aadbb0..b906393a379ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.RDDScanExec import org.apache.spark.sql.execution.streaming.{FlatMapGroupsWithStateExec, GroupStateImpl, MemoryStream} import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreId, StateStoreMetrics, UnsafeRowPair} -import org.apache.spark.sql.streaming.FlatMapGroupsWithStateSuite.MemoryStateStore import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.types.{DataType, IntegerType} @@ -1201,7 +1200,7 @@ object FlatMapGroupsWithStateSuite { } catch { case u: UnsupportedOperationException => return - case _ => + case _: Throwable => throw new TestFailedException("Unexpected exception when trying to get watermark", 20) } throw new TestFailedException("Could get watermark 
when not expected", 20) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index c53889bb8566c..cc693909270f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -744,7 +744,7 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi assert(returnedValue === expectedReturnValue, "Returned value does not match expected") } } - AwaitTerminationTester.test(expectedBehavior, awaitTermFunc) + AwaitTerminationTester.test(expectedBehavior, () => awaitTermFunc()) true // If the control reached here, then everything worked as expected } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index d6e15cfdd2723..ab7c8558321c8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -139,7 +139,7 @@ private[streaming] class FileBasedWriteAheadLog( def readFile(file: String): Iterator[ByteBuffer] = { logDebug(s"Creating log reader with $file") val reader = new FileBasedWriteAheadLogReader(file, hadoopConf) - CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _) + CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, () => reader.close()) } if (!closeFileAfterWrite) { logFilesToRead.iterator.map(readFile).flatten.asJava From 1051ebec70bf05971ddc80819d112626b1f1614f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Oct 2017 16:31:58 +0100 Subject: [PATCH 759/779] [SPARK-20783][SQL][FOLLOW-UP] Create ColumnVector to abstract existing compressed column ## What changes were proposed in this pull request? Removed one unused method. ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #19508 from viirya/SPARK-20783-followup. --- .../apache/spark/sql/execution/columnar/ColumnAccessor.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala index 445933d98e9d4..85c36b7da9498 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala @@ -63,9 +63,6 @@ private[columnar] abstract class BasicColumnAccessor[JvmType]( } protected def underlyingBuffer = buffer - - def getByteBuffer: ByteBuffer = - buffer.duplicate.order(ByteOrder.nativeOrder()) } private[columnar] class NullColumnAccessor(buffer: ByteBuffer) From 3d43a9f939764ec265de945921d1ecf2323ca230 Mon Sep 17 00:00:00 2001 From: liuxian Date: Wed, 25 Oct 2017 21:34:00 +0530 Subject: [PATCH 760/779] [SPARK-22349] In on-heap mode, when allocating memory from pool,we should fill memory with `MEMORY_DEBUG_FILL_CLEAN_VALUE` ## What changes were proposed in this pull request? In on-heap mode, when allocating memory from pool,we should fill memory with `MEMORY_DEBUG_FILL_CLEAN_VALUE` ## How was this patch tested? added unit tests Author: liuxian Closes #19572 from 10110346/MEMORY_DEBUG. 
--- .../spark/unsafe/memory/HeapMemoryAllocator.java | 3 +++ .../org/apache/spark/unsafe/PlatformUtilSuite.java | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index 355748238540b..cc9cc429643ad 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -56,6 +56,9 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { final MemoryBlock memory = blockReference.get(); if (memory != null) { assert (memory.size() == size); + if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { + memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + } return memory; } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 4ae49d82efa29..4b141339ec816 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -66,10 +66,21 @@ public void overlappingCopyMemory() { public void memoryDebugFillEnabledInTest() { Assert.assertTrue(MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED); MemoryBlock onheap = MemoryAllocator.HEAP.allocate(1); - MemoryBlock offheap = MemoryAllocator.UNSAFE.allocate(1); Assert.assertEquals( Platform.getByte(onheap.getBaseObject(), onheap.getBaseOffset()), MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + + MemoryBlock onheap1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + MemoryAllocator.HEAP.free(onheap1); + Assert.assertEquals( + Platform.getByte(onheap1.getBaseObject(), onheap1.getBaseOffset()), + MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); + MemoryBlock onheap2 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Assert.assertEquals( + Platform.getByte(onheap2.getBaseObject(), onheap2.getBaseOffset()), + MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + + MemoryBlock offheap = MemoryAllocator.UNSAFE.allocate(1); Assert.assertEquals( Platform.getByte(offheap.getBaseObject(), offheap.getBaseOffset()), MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); From 6ea8a56ca26a7e02e6574f5f763bb91059119a80 Mon Sep 17 00:00:00 2001 From: Andrea zito Date: Wed, 25 Oct 2017 10:10:24 -0700 Subject: [PATCH 761/779] [SPARK-21991][LAUNCHER] Fix race condition in LauncherServer#acceptConnections ## What changes were proposed in this pull request? This patch changes the order in which _acceptConnections_ starts the client thread and schedules the client timeout action ensuring that the latter has been scheduled before the former get a chance to cancel it. ## How was this patch tested? Due to the non-deterministic nature of the patch I wasn't able to add a new test for this issue. Author: Andrea zito Closes #19217 from nivox/SPARK-21991. 
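The ordering matters because the client thread's first action on a fast handshake is to cancel the timeout task; if that thread starts before the task has been scheduled, the cancel can run first and the timeout still fires afterwards. A toy sketch of the fixed ordering (illustrative only, not Spark's `LauncherServer`):

```
import java.util.{Timer, TimerTask}

object AcceptConnectionSketch {
  def handleClient(timer: Timer, timeoutMs: Long)(handshake: TimerTask => Unit): Unit = {
    val timeout = new TimerTask {
      override def run(): Unit = println("client connection timed out")
    }
    // Fixed ordering: schedule the timeout before the client thread could possibly cancel it.
    if (timeoutMs > 0) timer.schedule(timeout, timeoutMs) else timeout.run()
    new Thread(new Runnable {
      override def run(): Unit = handshake(timeout)
    }).start()
  }

  def main(args: Array[String]): Unit = {
    val timer = new Timer("timeout-timer", true)
    handleClient(timer, timeoutMs = 1000) { timeout =>
      // Simulated successful handshake: cancelling is always safe here because the
      // task was scheduled before this thread started running.
      timeout.cancel()
      println("handshake completed, timeout cancelled")
    }
    Thread.sleep(1500)
    timer.cancel()
  }
}
```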
--- .../apache/spark/launcher/LauncherServer.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index 865d4926da6a9..454bc7a7f924d 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -232,20 +232,20 @@ public void run() { }; ServerConnection clientConnection = new ServerConnection(client, timeout); Thread clientThread = factory.newThread(clientConnection); - synchronized (timeout) { - clientThread.start(); - synchronized (clients) { - clients.add(clientConnection); - } - long timeoutMs = getConnectionTimeout(); - // 0 is used for testing to avoid issues with clock resolution / thread scheduling, - // and force an immediate timeout. - if (timeoutMs > 0) { - timeoutTimer.schedule(timeout, getConnectionTimeout()); - } else { - timeout.run(); - } + synchronized (clients) { + clients.add(clientConnection); + } + + long timeoutMs = getConnectionTimeout(); + // 0 is used for testing to avoid issues with clock resolution / thread scheduling, + // and force an immediate timeout. + if (timeoutMs > 0) { + timeoutTimer.schedule(timeout, timeoutMs); + } else { + timeout.run(); } + + clientThread.start(); } } catch (IOException ioe) { if (running) { From d212ef14be7c2864cc529e48a02a47584e46f7a5 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 25 Oct 2017 13:53:01 -0700 Subject: [PATCH 762/779] [SPARK-22341][YARN] Impersonate correct user when preparing resources. The bug was introduced in SPARK-22290, which changed how the app's user is impersonated in the AM. The changed missed an initialization function that needs to be run as the app owner (who has the right credentials to read from HDFS). Author: Marcelo Vanzin Closes #19566 from vanzin/SPARK-22341. --- .../spark/deploy/yarn/ApplicationMaster.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index f6167235f89e4..244d912b9f3aa 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -97,9 +97,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends } } - private val client = ugi.doAs(new PrivilegedExceptionAction[YarnRMClient]() { - def run: YarnRMClient = new YarnRMClient() - }) + private val client = doAsUser { new YarnRMClient() } // Default to twice the number of executors (twice the maximum number of executors if dynamic // allocation is enabled), with a minimum of 3. @@ -178,7 +176,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends // Load the list of localized files set by the client. This is used when launching executors, // and is loaded here so that these configs don't pollute the Web UI's environment page in // cluster mode. 
- private val localResources = { + private val localResources = doAsUser { logInfo("Preparing Local resources") val resources = HashMap[String, LocalResource]() @@ -240,9 +238,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends } final def run(): Int = { - ugi.doAs(new PrivilegedExceptionAction[Unit]() { - def run: Unit = runImpl() - }) + doAsUser { + runImpl() + } exitCode } @@ -790,6 +788,12 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends } } + private def doAsUser[T](fn: => T): T = { + ugi.doAs(new PrivilegedExceptionAction[T]() { + override def run: T = fn + }) + } + } object ApplicationMaster extends Logging { From b377ef133cdc38d49b460b2cc6ece0b5892804cc Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Wed, 25 Oct 2017 22:15:44 +0100 Subject: [PATCH 763/779] [SPARK-22227][CORE] DiskBlockManager.getAllBlocks now tolerates temp files ## What changes were proposed in this pull request? Prior to this commit getAllBlocks implicitly assumed that the directories managed by the DiskBlockManager contain only the files corresponding to valid block IDs. In reality, this assumption was violated during shuffle, which produces temporary files in the same directory as the resulting blocks. As a result, calls to getAllBlocks during shuffle were unreliable. The fix could be made more efficient, but this is probably good enough. ## How was this patch tested? `DiskBlockManagerSuite` Author: Sergei Lebedev Closes #19458 from superbobry/block-id-option. --- .../scala/org/apache/spark/storage/BlockId.scala | 16 +++++++++++++--- .../apache/spark/storage/DiskBlockManager.scala | 11 ++++++++++- .../org/apache/spark/storage/BlockIdSuite.scala | 9 +++------ .../spark/storage/DiskBlockManagerSuite.scala | 7 +++++++ 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockId.scala b/core/src/main/scala/org/apache/spark/storage/BlockId.scala index a441baed2800e..7ac2c71c18eb3 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockId.scala @@ -19,6 +19,7 @@ package org.apache.spark.storage import java.util.UUID +import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi /** @@ -95,6 +96,10 @@ private[spark] case class TestBlockId(id: String) extends BlockId { override def name: String = "test_" + id } +@DeveloperApi +class UnrecognizedBlockId(name: String) + extends SparkException(s"Failed to parse $name into a block ID") + @DeveloperApi object BlockId { val RDD = "rdd_([0-9]+)_([0-9]+)".r @@ -104,10 +109,11 @@ object BlockId { val BROADCAST = "broadcast_([0-9]+)([_A-Za-z0-9]*)".r val TASKRESULT = "taskresult_([0-9]+)".r val STREAM = "input-([0-9]+)-([0-9]+)".r + val TEMP_LOCAL = "temp_local_([-A-Fa-f0-9]+)".r + val TEMP_SHUFFLE = "temp_shuffle_([-A-Fa-f0-9]+)".r val TEST = "test_(.*)".r - /** Converts a BlockId "name" String back into a BlockId. 
*/ - def apply(id: String): BlockId = id match { + def apply(name: String): BlockId = name match { case RDD(rddId, splitIndex) => RDDBlockId(rddId.toInt, splitIndex.toInt) case SHUFFLE(shuffleId, mapId, reduceId) => @@ -122,9 +128,13 @@ object BlockId { TaskResultBlockId(taskId.toLong) case STREAM(streamId, uniqueId) => StreamBlockId(streamId.toInt, uniqueId.toLong) + case TEMP_LOCAL(uuid) => + TempLocalBlockId(UUID.fromString(uuid)) + case TEMP_SHUFFLE(uuid) => + TempShuffleBlockId(UUID.fromString(uuid)) case TEST(value) => TestBlockId(value) case _ => - throw new IllegalStateException("Unrecognized BlockId: " + id) + throw new UnrecognizedBlockId(name) } } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 3d43e3c367aac..a69bcc9259995 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -100,7 +100,16 @@ private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolea /** List all the blocks currently stored on disk by the disk manager. */ def getAllBlocks(): Seq[BlockId] = { - getAllFiles().map(f => BlockId(f.getName)) + getAllFiles().flatMap { f => + try { + Some(BlockId(f.getName)) + } catch { + case _: UnrecognizedBlockId => + // Skip files which do not correspond to blocks, for example temporary + // files created by [[SortShuffleWriter]]. + None + } + } } /** Produces a unique block id and File suitable for storing local intermediate results. */ diff --git a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala index f0c521b00b583..ff4755833a916 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala @@ -35,13 +35,8 @@ class BlockIdSuite extends SparkFunSuite { } test("test-bad-deserialization") { - try { - // Try to deserialize an invalid block id. 
+ intercept[UnrecognizedBlockId] { BlockId("myblock") - fail() - } catch { - case e: IllegalStateException => // OK - case _: Throwable => fail() } } @@ -139,6 +134,7 @@ class BlockIdSuite extends SparkFunSuite { assert(id.id.getMostSignificantBits() === 5) assert(id.id.getLeastSignificantBits() === 2) assert(!id.isShuffle) + assertSame(id, BlockId(id.toString)) } test("temp shuffle") { @@ -151,6 +147,7 @@ class BlockIdSuite extends SparkFunSuite { assert(id.id.getMostSignificantBits() === 1) assert(id.id.getLeastSignificantBits() === 2) assert(!id.isShuffle) + assertSame(id, BlockId(id.toString)) } test("test") { diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala index 7859b0bba2b48..0c4f3c48ef802 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.storage import java.io.{File, FileWriter} +import java.util.UUID import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} @@ -79,6 +80,12 @@ class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with B assert(diskBlockManager.getAllBlocks.toSet === ids.toSet) } + test("SPARK-22227: non-block files are skipped") { + val file = diskBlockManager.getFile("unmanaged_file") + writeToFile(file, 10) + assert(diskBlockManager.getAllBlocks().isEmpty) + } + def writeToFile(file: File, numBytes: Int) { val writer = new FileWriter(file, true) for (i <- 0 until numBytes) writer.write(i) From 841f1d776f420424c20d99cf7110d06c73f9ca20 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 25 Oct 2017 14:31:36 -0700 Subject: [PATCH 764/779] [SPARK-22332][ML][TEST] Fix NaiveBayes unit test occasionly fail (cause by test dataset not deterministic) ## What changes were proposed in this pull request? Fix NaiveBayes unit test occasionly fail: Set seed for `BrzMultinomial.sample`, make `generateNaiveBayesInput` output deterministic dataset. (If we do not set seed, the generated dataset will be random, and the model will be possible to exceed the tolerance in the test, which trigger this failure) ## How was this patch tested? Manually run tests multiple times and check each time output models contains the same values. Author: WeichenXu Closes #19558 from WeichenXu123/fix_nb_test_seed. 
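As a side note for reviewers, a small self-contained sketch (assuming only Breeze on the classpath; object and method names are illustrative) of the determinism idea: seeding the implicit RandBasis makes the Multinomial draw the same sequence on every run, which is what the patch does inside `generateNaiveBayesInput`.

```scala
import breeze.linalg.DenseVector
import breeze.stats.distributions.{Multinomial, RandBasis}

object SeededMultinomialDemo {
  // Draw n class labels from a fixed categorical distribution using a seeded RandBasis.
  def sampleLabels(seed: Int, n: Int): IndexedSeq[Int] = {
    implicit val basis: RandBasis = RandBasis.withSeed(seed)
    Multinomial(DenseVector(0.2, 0.3, 0.5)).sample(n)
  }

  def main(args: Array[String]): Unit = {
    val first = sampleLabels(seed = 42, n = 10)
    val second = sampleLabels(seed = 42, n = 10)
    // Same seed, same draws: the generated test dataset is now deterministic.
    assert(first == second)
    println(first)
  }
}
```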
--- .../org/apache/spark/ml/classification/NaiveBayesSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 9730dd68a3b27..0d3adf993383f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.classification import scala.util.Random import breeze.linalg.{DenseVector => BDV, Vector => BV} -import breeze.stats.distributions.{Multinomial => BrzMultinomial} +import breeze.stats.distributions.{Multinomial => BrzMultinomial, RandBasis => BrzRandBasis} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.NaiveBayes.{Bernoulli, Multinomial} @@ -335,6 +335,7 @@ object NaiveBayesSuite { val _pi = pi.map(math.exp) val _theta = theta.map(row => row.map(math.exp)) + implicit val rngForBrzMultinomial = BrzRandBasis.withSeed(seed) for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) val xi = modelType match { From 5433be44caecaeef45ed1fdae10b223c698a9d14 Mon Sep 17 00:00:00 2001 From: Andrew Ash Date: Wed, 25 Oct 2017 14:41:02 -0700 Subject: [PATCH 765/779] [SPARK-21991][LAUNCHER][FOLLOWUP] Fix java lint ## What changes were proposed in this pull request? Fix java lint ## How was this patch tested? Run `./dev/lint-java` Author: Andrew Ash Closes #19574 from ash211/aash/fix-java-lint. --- .../main/java/org/apache/spark/launcher/LauncherServer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index 454bc7a7f924d..4353e3f263c51 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -235,7 +235,7 @@ public void run() { synchronized (clients) { clients.add(clientConnection); } - + long timeoutMs = getConnectionTimeout(); // 0 is used for testing to avoid issues with clock resolution / thread scheduling, // and force an immediate timeout. @@ -244,7 +244,7 @@ public void run() { } else { timeout.run(); } - + clientThread.start(); } } catch (IOException ioe) { From 592cfeab9caeff955d115a1ca5014ede7d402907 Mon Sep 17 00:00:00 2001 From: Nathan Kronenfeld Date: Thu, 26 Oct 2017 00:29:49 -0700 Subject: [PATCH 766/779] [SPARK-22308] Support alternative unit testing styles in external applications ## What changes were proposed in this pull request? Support unit tests of external code (i.e., applications that use spark) using scalatest that don't want to use FunSuite. SharedSparkContext already supports this, but SharedSQLContext does not. I've introduced SharedSparkSession as a parent to SharedSQLContext, written in a way that it does support all scalatest styles. ## How was this patch tested? There are three new unit test suites added that just test using FunSpec, FlatSpec, and WordSpec. Author: Nathan Kronenfeld Closes #19529 from nkronenfeld/alternative-style-tests-2. 
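To make the intent concrete, here is one more sketch in the same spirit as the Generic*SpecSuite files added below, this time using FreeSpec; it assumes it sits next to those suites in sql/core's test sources (where SharedSparkSession is visible), and the class and dataset names are illustrative.

```scala
package org.apache.spark.sql.test

import org.scalatest.FreeSpec

// Sketch only: the same pattern as the added suites, with a different ScalaTest style.
class GenericFreeSpecSuite extends FreeSpec with SharedSparkSession {
  import testImplicits._

  // Styles other than FunSuite may touch the session between spec text and tests,
  // so the session is initialized explicitly instead of relying on beforeAll.
  initializeSession()
  val ds = Seq((1, 1), (2, 1), (3, 2), (4, 2)).toDS

  "A Simple Dataset" - {
    "has the expected number of rows" in {
      assert(4 === ds.count)
    }
    "has the expected number of distinct values in the second column" in {
      assert(2 === ds.select("_2").distinct.count)
    }
  }
}
```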
--- .../org/apache/spark/SharedSparkContext.scala | 17 +- .../spark/sql/catalyst/plans/PlanTest.scala | 10 +- .../spark/sql/test/GenericFlatSpecSuite.scala | 45 +++++ .../spark/sql/test/GenericFunSpecSuite.scala | 47 +++++ .../spark/sql/test/GenericWordSpecSuite.scala | 51 ++++++ .../apache/spark/sql/test/SQLTestUtils.scala | 173 ++++++++++-------- .../spark/sql/test/SharedSQLContext.scala | 84 +-------- .../spark/sql/test/SharedSparkSession.scala | 119 ++++++++++++ 8 files changed, 381 insertions(+), 165 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/GenericFlatSpecSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/GenericWordSpecSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala diff --git a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala index 6aedcb1271ff6..1aa1c421d792e 100644 --- a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala +++ b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala @@ -29,10 +29,23 @@ trait SharedSparkContext extends BeforeAndAfterAll with BeforeAndAfterEach { sel var conf = new SparkConf(false) + /** + * Initialize the [[SparkContext]]. Generally, this is just called from beforeAll; however, in + * test using styles other than FunSuite, there is often code that relies on the session between + * test group constructs and the actual tests, which may need this session. It is purely a + * semantic difference, but semantically, it makes more sense to call 'initializeContext' between + * a 'describe' and an 'it' call than it does to call 'beforeAll'. + */ + protected def initializeContext(): Unit = { + if (null == _sc) { + _sc = new SparkContext( + "local[4]", "test", conf.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName)) + } + } + override def beforeAll() { super.beforeAll() - _sc = new SparkContext( - "local[4]", "test", conf.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName)) + initializeContext() } override def afterAll() { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 10bdfafd6f933..82c5307d54360 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.plans +import org.scalatest.Suite + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer @@ -29,7 +31,13 @@ import org.apache.spark.sql.internal.SQLConf /** * Provides helper methods for comparing plans. */ -trait PlanTest extends SparkFunSuite with PredicateHelper { +trait PlanTest extends SparkFunSuite with PlanTestBase + +/** + * Provides helper methods for comparing plans, but without the overhead of + * mandating a FunSuite. 
+ */ +trait PlanTestBase extends PredicateHelper { self: Suite => // TODO(gatorsmile): remove this from PlanTest and all the analyzer rules protected def conf = SQLConf.get diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFlatSpecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFlatSpecSuite.scala new file mode 100644 index 0000000000000..6179585a0d39a --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFlatSpecSuite.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.scalatest.FlatSpec + +/** + * The purpose of this suite is to make sure that generic FlatSpec-based scala + * tests work with a shared spark session + */ +class GenericFlatSpecSuite extends FlatSpec with SharedSparkSession { + import testImplicits._ + initializeSession() + val ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS + + "A Simple Dataset" should "have the specified number of elements" in { + assert(8 === ds.count) + } + it should "have the specified number of unique elements" in { + assert(8 === ds.distinct.count) + } + it should "have the specified number of elements in each column" in { + assert(8 === ds.select("_1").count) + assert(8 === ds.select("_2").count) + } + it should "have the correct number of distinct elements in each column" in { + assert(8 === ds.select("_1").distinct.count) + assert(4 === ds.select("_2").distinct.count) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala new file mode 100644 index 0000000000000..15139ee8b3047 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.test + +import org.scalatest.FunSpec + +/** + * The purpose of this suite is to make sure that generic FunSpec-based scala + * tests work with a shared spark session + */ +class GenericFunSpecSuite extends FunSpec with SharedSparkSession { + import testImplicits._ + initializeSession() + val ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS + + describe("Simple Dataset") { + it("should have the specified number of elements") { + assert(8 === ds.count) + } + it("should have the specified number of unique elements") { + assert(8 === ds.distinct.count) + } + it("should have the specified number of elements in each column") { + assert(8 === ds.select("_1").count) + assert(8 === ds.select("_2").count) + } + it("should have the correct number of distinct elements in each column") { + assert(8 === ds.select("_1").distinct.count) + assert(4 === ds.select("_2").distinct.count) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/GenericWordSpecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericWordSpecSuite.scala new file mode 100644 index 0000000000000..b6548bf95fec8 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericWordSpecSuite.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.test + +import org.scalatest.WordSpec + +/** + * The purpose of this suite is to make sure that generic WordSpec-based scala + * tests work with a shared spark session + */ +class GenericWordSpecSuite extends WordSpec with SharedSparkSession { + import testImplicits._ + initializeSession() + val ds = Seq((1, 1), (2, 1), (3, 2), (4, 2), (5, 3), (6, 3), (7, 4), (8, 4)).toDS + + "A Simple Dataset" when { + "looked at as complete rows" should { + "have the specified number of elements" in { + assert(8 === ds.count) + } + "have the specified number of unique elements" in { + assert(8 === ds.distinct.count) + } + } + "refined to specific columns" should { + "have the specified number of elements in each column" in { + assert(8 === ds.select("_1").count) + assert(8 === ds.select("_2").count) + } + "have the correct number of distinct elements in each column" in { + assert(8 === ds.select("_1").distinct.count) + assert(4 === ds.select("_2").distinct.count) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index a14a1441a4313..b4248b74f50ab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -27,7 +27,7 @@ import scala.language.implicitConversions import scala.util.control.NonFatal import org.apache.hadoop.fs.Path -import org.scalatest.BeforeAndAfterAll +import org.scalatest.{BeforeAndAfterAll, Suite} import org.scalatest.concurrent.Eventually import org.apache.spark.SparkFunSuite @@ -36,14 +36,17 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.FilterExec import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.{UninterruptibleThread, Utils} +import org.apache.spark.util.UninterruptibleThread +import org.apache.spark.util.Utils /** - * Helper trait that should be extended by all SQL test suites. + * Helper trait that should be extended by all SQL test suites within the Spark + * code base. * * This allows subclasses to plugin a custom `SQLContext`. It comes with test data * prepared in advance as well as all implicit conversions used extensively by dataframes. @@ -52,17 +55,99 @@ import org.apache.spark.util.{UninterruptibleThread, Utils} * Subclasses should *not* create `SQLContext`s in the test suite constructor, which is * prone to leaving multiple overlapping [[org.apache.spark.SparkContext]]s in the same JVM. */ -private[sql] trait SQLTestUtils - extends SparkFunSuite with Eventually +private[sql] trait SQLTestUtils extends SparkFunSuite with SQLTestUtilsBase with PlanTest { + // Whether to materialize all test data before the first test is run + private var loadTestDataBeforeTests = false + + protected override def beforeAll(): Unit = { + super.beforeAll() + if (loadTestDataBeforeTests) { + loadTestData() + } + } + + /** + * Materialize the test data immediately after the `SQLContext` is set up. + * This is necessary if the data is accessed by name but not through direct reference. 
+ */ + protected def setupTestData(): Unit = { + loadTestDataBeforeTests = true + } + + /** + * Disable stdout and stderr when running the test. To not output the logs to the console, + * ConsoleAppender's `follow` should be set to `true` so that it will honors reassignments of + * System.out or System.err. Otherwise, ConsoleAppender will still output to the console even if + * we change System.out and System.err. + */ + protected def testQuietly(name: String)(f: => Unit): Unit = { + test(name) { + quietly { + f + } + } + } + + /** + * Run a test on a separate `UninterruptibleThread`. + */ + protected def testWithUninterruptibleThread(name: String, quietly: Boolean = false) + (body: => Unit): Unit = { + val timeoutMillis = 10000 + @transient var ex: Throwable = null + + def runOnThread(): Unit = { + val thread = new UninterruptibleThread(s"Testing thread for test $name") { + override def run(): Unit = { + try { + body + } catch { + case NonFatal(e) => + ex = e + } + } + } + thread.setDaemon(true) + thread.start() + thread.join(timeoutMillis) + if (thread.isAlive) { + thread.interrupt() + // If this interrupt does not work, then this thread is most likely running something that + // is not interruptible. There is not much point to wait for the thread to termniate, and + // we rather let the JVM terminate the thread on exit. + fail( + s"Test '$name' running on o.a.s.util.UninterruptibleThread timed out after" + + s" $timeoutMillis ms") + } else if (ex != null) { + throw ex + } + } + + if (quietly) { + testQuietly(name) { runOnThread() } + } else { + test(name) { runOnThread() } + } + } +} + +/** + * Helper trait that can be extended by all external SQL test suites. + * + * This allows subclasses to plugin a custom `SQLContext`. + * To use implicit methods, import `testImplicits._` instead of through the `SQLContext`. + * + * Subclasses should *not* create `SQLContext`s in the test suite constructor, which is + * prone to leaving multiple overlapping [[org.apache.spark.SparkContext]]s in the same JVM. + */ +private[sql] trait SQLTestUtilsBase + extends Eventually with BeforeAndAfterAll with SQLTestData - with PlanTest { self => + with PlanTestBase { self: Suite => protected def sparkContext = spark.sparkContext - // Whether to materialize all test data before the first test is run - private var loadTestDataBeforeTests = false - // Shorthand for running a query using our SQLContext protected lazy val sql = spark.sql _ @@ -77,21 +162,6 @@ private[sql] trait SQLTestUtils protected override def _sqlContext: SQLContext = self.spark.sqlContext } - /** - * Materialize the test data immediately after the `SQLContext` is set up. - * This is necessary if the data is accessed by name but not through direct reference. - */ - protected def setupTestData(): Unit = { - loadTestDataBeforeTests = true - } - - protected override def beforeAll(): Unit = { - super.beforeAll() - if (loadTestDataBeforeTests) { - loadTestData() - } - } - protected override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { SparkSession.setActiveSession(spark) super.withSQLConf(pairs: _*)(f) @@ -297,61 +367,6 @@ private[sql] trait SQLTestUtils Dataset.ofRows(spark, plan) } - /** - * Disable stdout and stderr when running the test. To not output the logs to the console, - * ConsoleAppender's `follow` should be set to `true` so that it will honors reassignments of - * System.out or System.err. Otherwise, ConsoleAppender will still output to the console even if - * we change System.out and System.err. 
- */ - protected def testQuietly(name: String)(f: => Unit): Unit = { - test(name) { - quietly { - f - } - } - } - - /** - * Run a test on a separate `UninterruptibleThread`. - */ - protected def testWithUninterruptibleThread(name: String, quietly: Boolean = false) - (body: => Unit): Unit = { - val timeoutMillis = 10000 - @transient var ex: Throwable = null - - def runOnThread(): Unit = { - val thread = new UninterruptibleThread(s"Testing thread for test $name") { - override def run(): Unit = { - try { - body - } catch { - case NonFatal(e) => - ex = e - } - } - } - thread.setDaemon(true) - thread.start() - thread.join(timeoutMillis) - if (thread.isAlive) { - thread.interrupt() - // If this interrupt does not work, then this thread is most likely running something that - // is not interruptible. There is not much point to wait for the thread to termniate, and - // we rather let the JVM terminate the thread on exit. - fail( - s"Test '$name' running on o.a.s.util.UninterruptibleThread timed out after" + - s" $timeoutMillis ms") - } else if (ex != null) { - throw ex - } - } - - if (quietly) { - testQuietly(name) { runOnThread() } - } else { - test(name) { runOnThread() } - } - } /** * This method is used to make the given path qualified, when a path diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index cd8d0708d8a32..4d578e21f5494 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -17,86 +17,4 @@ package org.apache.spark.sql.test -import scala.concurrent.duration._ - -import org.scalatest.BeforeAndAfterEach -import org.scalatest.concurrent.Eventually - -import org.apache.spark.{DebugFilesystem, SparkConf} -import org.apache.spark.sql.{SparkSession, SQLContext} -import org.apache.spark.sql.internal.SQLConf - -/** - * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. - */ -trait SharedSQLContext extends SQLTestUtils with BeforeAndAfterEach with Eventually { - - protected def sparkConf = { - new SparkConf() - .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) - .set("spark.unsafe.exceptionOnMemoryLeak", "true") - .set(SQLConf.CODEGEN_FALLBACK.key, "false") - } - - /** - * The [[TestSparkSession]] to use for all tests in this suite. - * - * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local - * mode with the default test configurations. - */ - private var _spark: TestSparkSession = null - - /** - * The [[TestSparkSession]] to use for all tests in this suite. - */ - protected implicit def spark: SparkSession = _spark - - /** - * The [[TestSQLContext]] to use for all tests in this suite. - */ - protected implicit def sqlContext: SQLContext = _spark.sqlContext - - protected def createSparkSession: TestSparkSession = { - new TestSparkSession(sparkConf) - } - - /** - * Initialize the [[TestSparkSession]]. - */ - protected override def beforeAll(): Unit = { - SparkSession.sqlListener.set(null) - if (_spark == null) { - _spark = createSparkSession - } - // Ensure we have initialized the context before calling parent code - super.beforeAll() - } - - /** - * Stop the underlying [[org.apache.spark.SparkContext]], if any. 
- */ - protected override def afterAll(): Unit = { - super.afterAll() - if (_spark != null) { - _spark.sessionState.catalog.reset() - _spark.stop() - _spark = null - } - } - - protected override def beforeEach(): Unit = { - super.beforeEach() - DebugFilesystem.clearOpenStreams() - } - - protected override def afterEach(): Unit = { - super.afterEach() - // Clear all persistent datasets after each test - spark.sharedState.cacheManager.clearCache() - // files can be closed from other threads, so wait a bit - // normally this doesn't take more than 1s - eventually(timeout(10.seconds)) { - DebugFilesystem.assertNoOpenStreams() - } - } -} +trait SharedSQLContext extends SQLTestUtils with SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala new file mode 100644 index 0000000000000..e0568a3c5c99f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import scala.concurrent.duration._ + +import org.scalatest.{BeforeAndAfterEach, Suite} +import org.scalatest.concurrent.Eventually + +import org.apache.spark.{DebugFilesystem, SparkConf} +import org.apache.spark.sql.{SparkSession, SQLContext} +import org.apache.spark.sql.internal.SQLConf + +/** + * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. + */ +trait SharedSparkSession + extends SQLTestUtilsBase + with BeforeAndAfterEach + with Eventually { self: Suite => + + protected def sparkConf = { + new SparkConf() + .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) + .set("spark.unsafe.exceptionOnMemoryLeak", "true") + .set(SQLConf.CODEGEN_FALLBACK.key, "false") + } + + /** + * The [[TestSparkSession]] to use for all tests in this suite. + * + * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local + * mode with the default test configurations. + */ + private var _spark: TestSparkSession = null + + /** + * The [[TestSparkSession]] to use for all tests in this suite. + */ + protected implicit def spark: SparkSession = _spark + + /** + * The [[TestSQLContext]] to use for all tests in this suite. + */ + protected implicit def sqlContext: SQLContext = _spark.sqlContext + + protected def createSparkSession: TestSparkSession = { + new TestSparkSession(sparkConf) + } + + /** + * Initialize the [[TestSparkSession]]. Generally, this is just called from + * beforeAll; however, in test using styles other than FunSuite, there is + * often code that relies on the session between test group constructs and + * the actual tests, which may need this session. 
It is purely a semantic + * difference, but semantically, it makes more sense to call + * 'initializeSession' between a 'describe' and an 'it' call than it does to + * call 'beforeAll'. + */ + protected def initializeSession(): Unit = { + SparkSession.sqlListener.set(null) + if (_spark == null) { + _spark = createSparkSession + } + } + + /** + * Make sure the [[TestSparkSession]] is initialized before any tests are run. + */ + protected override def beforeAll(): Unit = { + initializeSession() + + // Ensure we have initialized the context before calling parent code + super.beforeAll() + } + + /** + * Stop the underlying [[org.apache.spark.SparkContext]], if any. + */ + protected override def afterAll(): Unit = { + super.afterAll() + if (_spark != null) { + _spark.sessionState.catalog.reset() + _spark.stop() + _spark = null + } + } + + protected override def beforeEach(): Unit = { + super.beforeEach() + DebugFilesystem.clearOpenStreams() + } + + protected override def afterEach(): Unit = { + super.afterEach() + // Clear all persistent datasets after each test + spark.sharedState.cacheManager.clearCache() + // files can be closed from other threads, so wait a bit + // normally this doesn't take more than 1s + eventually(timeout(10.seconds)) { + DebugFilesystem.assertNoOpenStreams() + } + } +} From 3073344a2551fb198d63f2114a519ab97904cb55 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 26 Oct 2017 15:50:27 +0800 Subject: [PATCH 767/779] [SPARK-21840][CORE] Add trait that allows conf to be directly set in application. Currently SparkSubmit uses system properties to propagate configuration to applications. This makes it hard to implement features such as SPARK-11035, which would allow multiple applications to be started in the same JVM. The current code would cause the config data from multiple apps to get mixed up. This change introduces a new trait, currently internal to Spark, that allows the app configuration to be passed directly to the application, without having to use system properties. The current "call main() method" behavior is maintained as an implementation of this new trait. This will be useful to allow multiple cluster mode apps to be submitted from the same JVM. As part of this, SparkSubmit was modified to collect all configuration directly into a SparkConf instance. Most of the changes are to tests so they use SparkConf instead of an opaque map. Tested with existing and added unit tests. Author: Marcelo Vanzin Closes #19519 from vanzin/SPARK-21840. --- .../spark/deploy/SparkApplication.scala | 55 +++++ .../org/apache/spark/deploy/SparkSubmit.scala | 160 +++++++------ .../spark/deploy/SparkSubmitSuite.scala | 213 ++++++++++-------- .../rest/StandaloneRestSubmitSuite.scala | 4 +- 4 files changed, 257 insertions(+), 175 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala b/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala new file mode 100644 index 0000000000000..118b4605675b0 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/SparkApplication.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy + +import java.lang.reflect.Modifier + +import org.apache.spark.SparkConf + +/** + * Entry point for a Spark application. Implementations must provide a no-argument constructor. + */ +private[spark] trait SparkApplication { + + def start(args: Array[String], conf: SparkConf): Unit + +} + +/** + * Implementation of SparkApplication that wraps a standard Java class with a "main" method. + * + * Configuration is propagated to the application via system properties, so running multiple + * of these in the same JVM may lead to undefined behavior due to configuration leaks. + */ +private[deploy] class JavaMainApplication(klass: Class[_]) extends SparkApplication { + + override def start(args: Array[String], conf: SparkConf): Unit = { + val mainMethod = klass.getMethod("main", new Array[String](0).getClass) + if (!Modifier.isStatic(mainMethod.getModifiers)) { + throw new IllegalStateException("The main method in the given main class must be static") + } + + val sysProps = conf.getAll.toMap + sysProps.foreach { case (k, v) => + sys.props(k) = v + } + + mainMethod.invoke(null, args) + } + +} diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index b7e6d0ea021a4..73b956ef3e470 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -158,7 +158,7 @@ object SparkSubmit extends CommandLineUtils with Logging { */ @tailrec private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = { - val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args) + val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args) def doRunMain(): Unit = { if (args.proxyUser != null) { @@ -167,7 +167,7 @@ object SparkSubmit extends CommandLineUtils with Logging { try { proxyUser.doAs(new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { - runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose) + runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose) } }) } catch { @@ -185,7 +185,7 @@ object SparkSubmit extends CommandLineUtils with Logging { } } } else { - runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose) + runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose) } } @@ -235,11 +235,11 @@ object SparkSubmit extends CommandLineUtils with Logging { private[deploy] def prepareSubmitEnvironment( args: SparkSubmitArguments, conf: Option[HadoopConfiguration] = None) - : (Seq[String], Seq[String], Map[String, String], String) = { + : (Seq[String], Seq[String], SparkConf, String) = { // Return values val childArgs = new ArrayBuffer[String]() val childClasspath = new ArrayBuffer[String]() - val sysProps = new HashMap[String, String]() + val sparkConf = new SparkConf() var 
childMainClass = "" // Set the cluster manager @@ -337,7 +337,6 @@ object SparkSubmit extends CommandLineUtils with Logging { } } - val sparkConf = new SparkConf(false) args.sparkProperties.foreach { case (k, v) => sparkConf.set(k, v) } val hadoopConf = conf.getOrElse(SparkHadoopUtil.newConfiguration(sparkConf)) val targetDir = Utils.createTempDir() @@ -351,8 +350,8 @@ object SparkSubmit extends CommandLineUtils with Logging { // for later use; e.g. in spark sql, the isolated class loader used to talk // to HiveMetastore will use these settings. They will be set as Java system // properties and then loaded by SparkConf - sysProps.put("spark.yarn.keytab", args.keytab) - sysProps.put("spark.yarn.principal", args.principal) + sparkConf.set(KEYTAB, args.keytab) + sparkConf.set(PRINCIPAL, args.principal) UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab) } } @@ -364,23 +363,24 @@ object SparkSubmit extends CommandLineUtils with Logging { args.pyFiles = Option(args.pyFiles).map(resolveGlobPaths(_, hadoopConf)).orNull args.archives = Option(args.archives).map(resolveGlobPaths(_, hadoopConf)).orNull + // This security manager will not need an auth secret, but set a dummy value in case + // spark.authenticate is enabled, otherwise an exception is thrown. + lazy val downloadConf = sparkConf.clone().set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") + lazy val secMgr = new SecurityManager(downloadConf) + // In client mode, download remote files. var localPrimaryResource: String = null var localJars: String = null var localPyFiles: String = null if (deployMode == CLIENT) { - // This security manager will not need an auth secret, but set a dummy value in case - // spark.authenticate is enabled, otherwise an exception is thrown. - sparkConf.set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") - val secMgr = new SecurityManager(sparkConf) localPrimaryResource = Option(args.primaryResource).map { - downloadFile(_, targetDir, sparkConf, hadoopConf, secMgr) + downloadFile(_, targetDir, downloadConf, hadoopConf, secMgr) }.orNull localJars = Option(args.jars).map { - downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) + downloadFileList(_, targetDir, downloadConf, hadoopConf, secMgr) }.orNull localPyFiles = Option(args.pyFiles).map { - downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) + downloadFileList(_, targetDir, downloadConf, hadoopConf, secMgr) }.orNull } @@ -409,7 +409,7 @@ object SparkSubmit extends CommandLineUtils with Logging { if (file.exists()) { file.toURI.toString } else { - downloadFile(resource, targetDir, sparkConf, hadoopConf, secMgr) + downloadFile(resource, targetDir, downloadConf, hadoopConf, secMgr) } case _ => uri.toString } @@ -449,7 +449,7 @@ object SparkSubmit extends CommandLineUtils with Logging { args.files = mergeFileLists(args.files, args.pyFiles) } if (localPyFiles != null) { - sysProps("spark.submit.pyFiles") = localPyFiles + sparkConf.set("spark.submit.pyFiles", localPyFiles) } } @@ -515,69 +515,69 @@ object SparkSubmit extends CommandLineUtils with Logging { } // Special flag to avoid deprecation warnings at the client - sysProps("SPARK_SUBMIT") = "true" + sys.props("SPARK_SUBMIT") = "true" // A list of rules to map each argument to system properties or command-line options in // each deploy mode; we iterate through these below val options = List[OptionAssigner]( // All cluster managers - OptionAssigner(args.master, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.master"), + OptionAssigner(args.master, ALL_CLUSTER_MGRS, 
ALL_DEPLOY_MODES, confKey = "spark.master"), OptionAssigner(args.deployMode, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, - sysProp = "spark.submit.deployMode"), - OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.app.name"), - OptionAssigner(args.ivyRepoPath, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.jars.ivy"), + confKey = "spark.submit.deployMode"), + OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, confKey = "spark.app.name"), + OptionAssigner(args.ivyRepoPath, ALL_CLUSTER_MGRS, CLIENT, confKey = "spark.jars.ivy"), OptionAssigner(args.driverMemory, ALL_CLUSTER_MGRS, CLIENT, - sysProp = "spark.driver.memory"), + confKey = "spark.driver.memory"), OptionAssigner(args.driverExtraClassPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, - sysProp = "spark.driver.extraClassPath"), + confKey = "spark.driver.extraClassPath"), OptionAssigner(args.driverExtraJavaOptions, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, - sysProp = "spark.driver.extraJavaOptions"), + confKey = "spark.driver.extraJavaOptions"), OptionAssigner(args.driverExtraLibraryPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, - sysProp = "spark.driver.extraLibraryPath"), + confKey = "spark.driver.extraLibraryPath"), // Propagate attributes for dependency resolution at the driver side - OptionAssigner(args.packages, STANDALONE | MESOS, CLUSTER, sysProp = "spark.jars.packages"), + OptionAssigner(args.packages, STANDALONE | MESOS, CLUSTER, confKey = "spark.jars.packages"), OptionAssigner(args.repositories, STANDALONE | MESOS, CLUSTER, - sysProp = "spark.jars.repositories"), - OptionAssigner(args.ivyRepoPath, STANDALONE | MESOS, CLUSTER, sysProp = "spark.jars.ivy"), + confKey = "spark.jars.repositories"), + OptionAssigner(args.ivyRepoPath, STANDALONE | MESOS, CLUSTER, confKey = "spark.jars.ivy"), OptionAssigner(args.packagesExclusions, STANDALONE | MESOS, - CLUSTER, sysProp = "spark.jars.excludes"), + CLUSTER, confKey = "spark.jars.excludes"), // Yarn only - OptionAssigner(args.queue, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.queue"), + OptionAssigner(args.queue, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.queue"), OptionAssigner(args.numExecutors, YARN, ALL_DEPLOY_MODES, - sysProp = "spark.executor.instances"), - OptionAssigner(args.pyFiles, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.pyFiles"), - OptionAssigner(args.jars, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.jars"), - OptionAssigner(args.files, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.files"), - OptionAssigner(args.archives, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.dist.archives"), - OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.principal"), - OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, sysProp = "spark.yarn.keytab"), + confKey = "spark.executor.instances"), + OptionAssigner(args.pyFiles, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.pyFiles"), + OptionAssigner(args.jars, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.jars"), + OptionAssigner(args.files, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.files"), + OptionAssigner(args.archives, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.archives"), + OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.principal"), + OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.keytab"), // Other options OptionAssigner(args.executorCores, STANDALONE | YARN, ALL_DEPLOY_MODES, - sysProp = "spark.executor.cores"), + confKey = "spark.executor.cores"), OptionAssigner(args.executorMemory, STANDALONE 
| MESOS | YARN, ALL_DEPLOY_MODES, - sysProp = "spark.executor.memory"), + confKey = "spark.executor.memory"), OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS, ALL_DEPLOY_MODES, - sysProp = "spark.cores.max"), + confKey = "spark.cores.max"), OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES, - sysProp = "spark.files"), - OptionAssigner(args.jars, LOCAL, CLIENT, sysProp = "spark.jars"), - OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, sysProp = "spark.jars"), + confKey = "spark.files"), + OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), + OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.jars"), OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN, CLUSTER, - sysProp = "spark.driver.memory"), + confKey = "spark.driver.memory"), OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN, CLUSTER, - sysProp = "spark.driver.cores"), + confKey = "spark.driver.cores"), OptionAssigner(args.supervise.toString, STANDALONE | MESOS, CLUSTER, - sysProp = "spark.driver.supervise"), - OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, sysProp = "spark.jars.ivy"), + confKey = "spark.driver.supervise"), + OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, confKey = "spark.jars.ivy"), // An internal option used only for spark-shell to add user jars to repl's classloader, // previously it uses "spark.jars" or "spark.yarn.dist.jars" which now may be pointed to // remote jars, so adding a new option to only specify local jars for spark-shell internally. - OptionAssigner(localJars, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.repl.local.jars") + OptionAssigner(localJars, ALL_CLUSTER_MGRS, CLIENT, confKey = "spark.repl.local.jars") ) // In client mode, launch the application main class directly @@ -610,24 +610,24 @@ object SparkSubmit extends CommandLineUtils with Logging { (deployMode & opt.deployMode) != 0 && (clusterManager & opt.clusterManager) != 0) { if (opt.clOption != null) { childArgs += (opt.clOption, opt.value) } - if (opt.sysProp != null) { sysProps.put(opt.sysProp, opt.value) } + if (opt.confKey != null) { sparkConf.set(opt.confKey, opt.value) } } } // In case of shells, spark.ui.showConsoleProgress can be true by default or by user. if (isShell(args.primaryResource) && !sparkConf.contains(UI_SHOW_CONSOLE_PROGRESS)) { - sysProps(UI_SHOW_CONSOLE_PROGRESS.key) = "true" + sparkConf.set(UI_SHOW_CONSOLE_PROGRESS, true) } // Add the application jar automatically so the user doesn't have to call sc.addJar // For YARN cluster mode, the jar is already distributed on each node as "app.jar" // For python and R files, the primary resource is already distributed as a regular file if (!isYarnCluster && !args.isPython && !args.isR) { - var jars = sysProps.get("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq.empty) + var jars = sparkConf.getOption("spark.jars").map(x => x.split(",").toSeq).getOrElse(Seq.empty) if (isUserJar(args.primaryResource)) { jars = jars ++ Seq(args.primaryResource) } - sysProps.put("spark.jars", jars.mkString(",")) + sparkConf.set("spark.jars", jars.mkString(",")) } // In standalone cluster mode, use the REST client to submit the application (Spark 1.3+). @@ -653,12 +653,12 @@ object SparkSubmit extends CommandLineUtils with Logging { // Let YARN know it's a pyspark app, so it distributes needed libraries. 
if (clusterManager == YARN) { if (args.isPython) { - sysProps.put("spark.yarn.isPython", "true") + sparkConf.set("spark.yarn.isPython", "true") } } if (clusterManager == MESOS && UserGroupInformation.isSecurityEnabled) { - setRMPrincipal(sysProps) + setRMPrincipal(sparkConf) } // In yarn-cluster mode, use yarn.Client as a wrapper around the user class @@ -689,7 +689,7 @@ object SparkSubmit extends CommandLineUtils with Logging { // Second argument is main class childArgs += (args.primaryResource, "") if (args.pyFiles != null) { - sysProps("spark.submit.pyFiles") = args.pyFiles + sparkConf.set("spark.submit.pyFiles", args.pyFiles) } } else if (args.isR) { // Second argument is main class @@ -704,12 +704,12 @@ object SparkSubmit extends CommandLineUtils with Logging { // Load any properties specified through --conf and the default properties file for ((k, v) <- args.sparkProperties) { - sysProps.getOrElseUpdate(k, v) + sparkConf.setIfMissing(k, v) } // Ignore invalid spark.driver.host in cluster modes. if (deployMode == CLUSTER) { - sysProps -= "spark.driver.host" + sparkConf.remove("spark.driver.host") } // Resolve paths in certain spark properties @@ -721,15 +721,15 @@ object SparkSubmit extends CommandLineUtils with Logging { "spark.yarn.dist.jars") pathConfigs.foreach { config => // Replace old URIs with resolved URIs, if they exist - sysProps.get(config).foreach { oldValue => - sysProps(config) = Utils.resolveURIs(oldValue) + sparkConf.getOption(config).foreach { oldValue => + sparkConf.set(config, Utils.resolveURIs(oldValue)) } } // Resolve and format python file paths properly before adding them to the PYTHONPATH. // The resolving part is redundant in the case of --py-files, but necessary if the user // explicitly sets `spark.submit.pyFiles` in his/her default properties file. - sysProps.get("spark.submit.pyFiles").foreach { pyFiles => + sparkConf.getOption("spark.submit.pyFiles").foreach { pyFiles => val resolvedPyFiles = Utils.resolveURIs(pyFiles) val formattedPyFiles = if (!isYarnCluster && !isMesosCluster) { PythonRunner.formatPaths(resolvedPyFiles).mkString(",") @@ -739,22 +739,22 @@ object SparkSubmit extends CommandLineUtils with Logging { // locally. resolvedPyFiles } - sysProps("spark.submit.pyFiles") = formattedPyFiles + sparkConf.set("spark.submit.pyFiles", formattedPyFiles) } - (childArgs, childClasspath, sysProps, childMainClass) + (childArgs, childClasspath, sparkConf, childMainClass) } // [SPARK-20328]. HadoopRDD calls into a Hadoop library that fetches delegation tokens with // renewer set to the YARN ResourceManager. Since YARN isn't configured in Mesos mode, we // must trick it into thinking we're YARN. 
- private def setRMPrincipal(sysProps: HashMap[String, String]): Unit = { + private def setRMPrincipal(sparkConf: SparkConf): Unit = { val shortUserName = UserGroupInformation.getCurrentUser.getShortUserName val key = s"spark.hadoop.${YarnConfiguration.RM_PRINCIPAL}" // scalastyle:off println printStream.println(s"Setting ${key} to ${shortUserName}") // scalastyle:off println - sysProps.put(key, shortUserName) + sparkConf.set(key, shortUserName) } /** @@ -766,7 +766,7 @@ object SparkSubmit extends CommandLineUtils with Logging { private def runMain( childArgs: Seq[String], childClasspath: Seq[String], - sysProps: Map[String, String], + sparkConf: SparkConf, childMainClass: String, verbose: Boolean): Unit = { // scalastyle:off println @@ -774,14 +774,14 @@ object SparkSubmit extends CommandLineUtils with Logging { printStream.println(s"Main class:\n$childMainClass") printStream.println(s"Arguments:\n${childArgs.mkString("\n")}") // sysProps may contain sensitive information, so redact before printing - printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}") + printStream.println(s"Spark config:\n${Utils.redact(sparkConf.getAll.toMap).mkString("\n")}") printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}") printStream.println("\n") } // scalastyle:on println val loader = - if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { + if (sparkConf.get(DRIVER_USER_CLASS_PATH_FIRST)) { new ChildFirstURLClassLoader(new Array[URL](0), Thread.currentThread.getContextClassLoader) } else { @@ -794,10 +794,6 @@ object SparkSubmit extends CommandLineUtils with Logging { addJarToClasspath(jar, loader) } - for ((key, value) <- sysProps) { - System.setProperty(key, value) - } - var mainClass: Class[_] = null try { @@ -823,14 +819,14 @@ object SparkSubmit extends CommandLineUtils with Logging { System.exit(CLASS_NOT_FOUND_EXIT_STATUS) } - // SPARK-4170 - if (classOf[scala.App].isAssignableFrom(mainClass)) { - printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.") - } - - val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass) - if (!Modifier.isStatic(mainMethod.getModifiers)) { - throw new IllegalStateException("The main method in the given main class must be static") + val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) { + mainClass.newInstance().asInstanceOf[SparkApplication] + } else { + // SPARK-4170 + if (classOf[scala.App].isAssignableFrom(mainClass)) { + printWarning("Subclasses of scala.App may not work correctly. 
Use a main() method instead.") + } + new JavaMainApplication(mainClass) } @tailrec @@ -844,7 +840,7 @@ object SparkSubmit extends CommandLineUtils with Logging { } try { - mainMethod.invoke(null, childArgs.toArray) + app.start(childArgs.toArray, sparkConf) } catch { case t: Throwable => findCause(t) match { @@ -1271,4 +1267,4 @@ private case class OptionAssigner( clusterManager: Int, deployMode: Int, clOption: String = null, - sysProp: String = null) + confKey: String = null) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index b52da4c0c8bc3..cfbf56fb8c369 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -176,10 +176,10 @@ class SparkSubmitSuite "thejar.jar" ) val appArgs = new SparkSubmitArguments(clArgs) - val (_, _, sysProps, _) = prepareSubmitEnvironment(appArgs) + val (_, _, conf, _) = prepareSubmitEnvironment(appArgs) appArgs.deployMode should be ("client") - sysProps("spark.submit.deployMode") should be ("client") + conf.get("spark.submit.deployMode") should be ("client") // Both cmd line and configuration are specified, cmdline option takes the priority val clArgs1 = Seq( @@ -190,10 +190,10 @@ class SparkSubmitSuite "thejar.jar" ) val appArgs1 = new SparkSubmitArguments(clArgs1) - val (_, _, sysProps1, _) = prepareSubmitEnvironment(appArgs1) + val (_, _, conf1, _) = prepareSubmitEnvironment(appArgs1) appArgs1.deployMode should be ("cluster") - sysProps1("spark.submit.deployMode") should be ("cluster") + conf1.get("spark.submit.deployMode") should be ("cluster") // Neither cmdline nor configuration are specified, client mode is the default choice val clArgs2 = Seq( @@ -204,9 +204,9 @@ class SparkSubmitSuite val appArgs2 = new SparkSubmitArguments(clArgs2) appArgs2.deployMode should be (null) - val (_, _, sysProps2, _) = prepareSubmitEnvironment(appArgs2) + val (_, _, conf2, _) = prepareSubmitEnvironment(appArgs2) appArgs2.deployMode should be ("client") - sysProps2("spark.submit.deployMode") should be ("client") + conf2.get("spark.submit.deployMode") should be ("client") } test("handles YARN cluster mode") { @@ -227,7 +227,7 @@ class SparkSubmitSuite "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) val childArgsStr = childArgs.mkString(" ") childArgsStr should include ("--class org.SomeClass") childArgsStr should include ("--arg arg1 --arg arg2") @@ -240,16 +240,16 @@ class SparkSubmitSuite classpath(2) should endWith ("two.jar") classpath(3) should endWith ("three.jar") - sysProps("spark.executor.memory") should be ("5g") - sysProps("spark.driver.memory") should be ("4g") - sysProps("spark.executor.cores") should be ("5") - sysProps("spark.yarn.queue") should be ("thequeue") - sysProps("spark.yarn.dist.jars") should include regex (".*one.jar,.*two.jar,.*three.jar") - sysProps("spark.yarn.dist.files") should include regex (".*file1.txt,.*file2.txt") - sysProps("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt") - sysProps("spark.app.name") should be ("beauty") - sysProps("spark.ui.enabled") should be ("false") - sysProps("SPARK_SUBMIT") should be ("true") + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.driver.memory") should be 
("4g") + conf.get("spark.executor.cores") should be ("5") + conf.get("spark.yarn.queue") should be ("thequeue") + conf.get("spark.yarn.dist.jars") should include regex (".*one.jar,.*two.jar,.*three.jar") + conf.get("spark.yarn.dist.files") should include regex (".*file1.txt,.*file2.txt") + conf.get("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt") + conf.get("spark.app.name") should be ("beauty") + conf.get("spark.ui.enabled") should be ("false") + sys.props("SPARK_SUBMIT") should be ("true") } test("handles YARN client mode") { @@ -270,7 +270,7 @@ class SparkSubmitSuite "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (4) @@ -278,17 +278,17 @@ class SparkSubmitSuite classpath(1) should endWith ("one.jar") classpath(2) should endWith ("two.jar") classpath(3) should endWith ("three.jar") - sysProps("spark.app.name") should be ("trill") - sysProps("spark.executor.memory") should be ("5g") - sysProps("spark.executor.cores") should be ("5") - sysProps("spark.yarn.queue") should be ("thequeue") - sysProps("spark.executor.instances") should be ("6") - sysProps("spark.yarn.dist.files") should include regex (".*file1.txt,.*file2.txt") - sysProps("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt") - sysProps("spark.yarn.dist.jars") should include + conf.get("spark.app.name") should be ("trill") + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.executor.cores") should be ("5") + conf.get("spark.yarn.queue") should be ("thequeue") + conf.get("spark.executor.instances") should be ("6") + conf.get("spark.yarn.dist.files") should include regex (".*file1.txt,.*file2.txt") + conf.get("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt") + conf.get("spark.yarn.dist.jars") should include regex (".*one.jar,.*two.jar,.*three.jar,.*thejar.jar") - sysProps("SPARK_SUBMIT") should be ("true") - sysProps("spark.ui.enabled") should be ("false") + conf.get("spark.ui.enabled") should be ("false") + sys.props("SPARK_SUBMIT") should be ("true") } test("handles standalone cluster mode") { @@ -316,7 +316,7 @@ class SparkSubmitSuite "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) appArgs.useRest = useRest - val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) val childArgsStr = childArgs.mkString(" ") if (useRest) { childArgsStr should endWith ("thejar.jar org.SomeClass arg1 arg2") @@ -327,17 +327,18 @@ class SparkSubmitSuite mainClass should be ("org.apache.spark.deploy.Client") } classpath should have size 0 - sysProps should have size 9 - sysProps.keys should contain ("SPARK_SUBMIT") - sysProps.keys should contain ("spark.master") - sysProps.keys should contain ("spark.app.name") - sysProps.keys should contain ("spark.jars") - sysProps.keys should contain ("spark.driver.memory") - sysProps.keys should contain ("spark.driver.cores") - sysProps.keys should contain ("spark.driver.supervise") - sysProps.keys should contain ("spark.ui.enabled") - sysProps.keys should contain ("spark.submit.deployMode") - sysProps("spark.ui.enabled") should be ("false") + sys.props("SPARK_SUBMIT") 
should be ("true") + + val confMap = conf.getAll.toMap + confMap.keys should contain ("spark.master") + confMap.keys should contain ("spark.app.name") + confMap.keys should contain ("spark.jars") + confMap.keys should contain ("spark.driver.memory") + confMap.keys should contain ("spark.driver.cores") + confMap.keys should contain ("spark.driver.supervise") + confMap.keys should contain ("spark.ui.enabled") + confMap.keys should contain ("spark.submit.deployMode") + conf.get("spark.ui.enabled") should be ("false") } test("handles standalone client mode") { @@ -352,14 +353,14 @@ class SparkSubmitSuite "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (1) classpath(0) should endWith ("thejar.jar") - sysProps("spark.executor.memory") should be ("5g") - sysProps("spark.cores.max") should be ("5") - sysProps("spark.ui.enabled") should be ("false") + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.cores.max") should be ("5") + conf.get("spark.ui.enabled") should be ("false") } test("handles mesos client mode") { @@ -374,14 +375,14 @@ class SparkSubmitSuite "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (1) classpath(0) should endWith ("thejar.jar") - sysProps("spark.executor.memory") should be ("5g") - sysProps("spark.cores.max") should be ("5") - sysProps("spark.ui.enabled") should be ("false") + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.cores.max") should be ("5") + conf.get("spark.ui.enabled") should be ("false") } test("handles confs with flag equivalents") { @@ -394,23 +395,26 @@ class SparkSubmitSuite "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (_, _, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) - sysProps("spark.executor.memory") should be ("5g") - sysProps("spark.master") should be ("yarn") - sysProps("spark.submit.deployMode") should be ("cluster") + val (_, _, conf, mainClass) = prepareSubmitEnvironment(appArgs) + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.master") should be ("yarn") + conf.get("spark.submit.deployMode") should be ("cluster") mainClass should be ("org.apache.spark.deploy.yarn.Client") } test("SPARK-21568 ConsoleProgressBar should be enabled only in shells") { + // Unset from system properties since this config is defined in the root pom's test config. 
+ sys.props -= UI_SHOW_CONSOLE_PROGRESS.key + val clArgs1 = Seq("--class", "org.apache.spark.repl.Main", "spark-shell") val appArgs1 = new SparkSubmitArguments(clArgs1) - val (_, _, sysProps1, _) = prepareSubmitEnvironment(appArgs1) - sysProps1(UI_SHOW_CONSOLE_PROGRESS.key) should be ("true") + val (_, _, conf1, _) = prepareSubmitEnvironment(appArgs1) + conf1.get(UI_SHOW_CONSOLE_PROGRESS) should be (true) val clArgs2 = Seq("--class", "org.SomeClass", "thejar.jar") val appArgs2 = new SparkSubmitArguments(clArgs2) - val (_, _, sysProps2, _) = prepareSubmitEnvironment(appArgs2) - sysProps2.keys should not contain UI_SHOW_CONSOLE_PROGRESS.key + val (_, _, conf2, _) = prepareSubmitEnvironment(appArgs2) + assert(!conf2.contains(UI_SHOW_CONSOLE_PROGRESS)) } test("launch simple application with spark-submit") { @@ -585,11 +589,11 @@ class SparkSubmitSuite "--files", files, "thejar.jar") val appArgs = new SparkSubmitArguments(clArgs) - val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 + val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs) appArgs.jars should be (Utils.resolveURIs(jars)) appArgs.files should be (Utils.resolveURIs(files)) - sysProps("spark.jars") should be (Utils.resolveURIs(jars + ",thejar.jar")) - sysProps("spark.files") should be (Utils.resolveURIs(files)) + conf.get("spark.jars") should be (Utils.resolveURIs(jars + ",thejar.jar")) + conf.get("spark.files") should be (Utils.resolveURIs(files)) // Test files and archives (Yarn) val clArgs2 = Seq( @@ -600,11 +604,11 @@ class SparkSubmitSuite "thejar.jar" ) val appArgs2 = new SparkSubmitArguments(clArgs2) - val sysProps2 = SparkSubmit.prepareSubmitEnvironment(appArgs2)._3 + val (_, _, conf2, _) = SparkSubmit.prepareSubmitEnvironment(appArgs2) appArgs2.files should be (Utils.resolveURIs(files)) appArgs2.archives should be (Utils.resolveURIs(archives)) - sysProps2("spark.yarn.dist.files") should be (Utils.resolveURIs(files)) - sysProps2("spark.yarn.dist.archives") should be (Utils.resolveURIs(archives)) + conf2.get("spark.yarn.dist.files") should be (Utils.resolveURIs(files)) + conf2.get("spark.yarn.dist.archives") should be (Utils.resolveURIs(archives)) // Test python files val clArgs3 = Seq( @@ -615,12 +619,12 @@ class SparkSubmitSuite "mister.py" ) val appArgs3 = new SparkSubmitArguments(clArgs3) - val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 + val (_, _, conf3, _) = SparkSubmit.prepareSubmitEnvironment(appArgs3) appArgs3.pyFiles should be (Utils.resolveURIs(pyFiles)) - sysProps3("spark.submit.pyFiles") should be ( + conf3.get("spark.submit.pyFiles") should be ( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) - sysProps3(PYSPARK_DRIVER_PYTHON.key) should be ("python3.4") - sysProps3(PYSPARK_PYTHON.key) should be ("python3.5") + conf3.get(PYSPARK_DRIVER_PYTHON.key) should be ("python3.4") + conf3.get(PYSPARK_PYTHON.key) should be ("python3.5") } test("resolves config paths correctly") { @@ -644,9 +648,9 @@ class SparkSubmitSuite "thejar.jar" ) val appArgs = new SparkSubmitArguments(clArgs) - val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 - sysProps("spark.jars") should be(Utils.resolveURIs(jars + ",thejar.jar")) - sysProps("spark.files") should be(Utils.resolveURIs(files)) + val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs) + conf.get("spark.jars") should be(Utils.resolveURIs(jars + ",thejar.jar")) + conf.get("spark.files") should be(Utils.resolveURIs(files)) // Test files and archives (Yarn) val f2 = 
File.createTempFile("test-submit-files-archives", "", tmpDir) @@ -661,9 +665,9 @@ class SparkSubmitSuite "thejar.jar" ) val appArgs2 = new SparkSubmitArguments(clArgs2) - val sysProps2 = SparkSubmit.prepareSubmitEnvironment(appArgs2)._3 - sysProps2("spark.yarn.dist.files") should be(Utils.resolveURIs(files)) - sysProps2("spark.yarn.dist.archives") should be(Utils.resolveURIs(archives)) + val (_, _, conf2, _) = SparkSubmit.prepareSubmitEnvironment(appArgs2) + conf2.get("spark.yarn.dist.files") should be(Utils.resolveURIs(files)) + conf2.get("spark.yarn.dist.archives") should be(Utils.resolveURIs(archives)) // Test python files val f3 = File.createTempFile("test-submit-python-files", "", tmpDir) @@ -676,8 +680,8 @@ class SparkSubmitSuite "mister.py" ) val appArgs3 = new SparkSubmitArguments(clArgs3) - val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 - sysProps3("spark.submit.pyFiles") should be( + val (_, _, conf3, _) = SparkSubmit.prepareSubmitEnvironment(appArgs3) + conf3.get("spark.submit.pyFiles") should be( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) // Test remote python files @@ -693,11 +697,9 @@ class SparkSubmitSuite "hdfs:///tmp/mister.py" ) val appArgs4 = new SparkSubmitArguments(clArgs4) - val sysProps4 = SparkSubmit.prepareSubmitEnvironment(appArgs4)._3 + val (_, _, conf4, _) = SparkSubmit.prepareSubmitEnvironment(appArgs4) // Should not format python path for yarn cluster mode - sysProps4("spark.submit.pyFiles") should be( - Utils.resolveURIs(remotePyFiles) - ) + conf4.get("spark.submit.pyFiles") should be(Utils.resolveURIs(remotePyFiles)) } test("user classpath first in driver") { @@ -771,14 +773,14 @@ class SparkSubmitSuite jar2.toString) val appArgs = new SparkSubmitArguments(args) - val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 - sysProps("spark.yarn.dist.jars").split(",").toSet should be + val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs) + conf.get("spark.yarn.dist.jars").split(",").toSet should be (Set(jar1.toURI.toString, jar2.toURI.toString)) - sysProps("spark.yarn.dist.files").split(",").toSet should be + conf.get("spark.yarn.dist.files").split(",").toSet should be (Set(file1.toURI.toString, file2.toURI.toString)) - sysProps("spark.yarn.dist.pyFiles").split(",").toSet should be + conf.get("spark.yarn.dist.pyFiles").split(",").toSet should be (Set(pyFile1.getAbsolutePath, pyFile2.getAbsolutePath)) - sysProps("spark.yarn.dist.archives").split(",").toSet should be + conf.get("spark.yarn.dist.archives").split(",").toSet should be (Set(archive1.toURI.toString, archive2.toURI.toString)) } @@ -897,18 +899,18 @@ class SparkSubmitSuite ) val appArgs = new SparkSubmitArguments(args) - val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs, Some(hadoopConf))._3 + val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs, Some(hadoopConf)) // All the resources should still be remote paths, so that YARN client will not upload again. - sysProps("spark.yarn.dist.jars") should be (tmpJarPath) - sysProps("spark.yarn.dist.files") should be (s"s3a://${file.getAbsolutePath}") - sysProps("spark.yarn.dist.pyFiles") should be (s"s3a://${pyFile.getAbsolutePath}") + conf.get("spark.yarn.dist.jars") should be (tmpJarPath) + conf.get("spark.yarn.dist.files") should be (s"s3a://${file.getAbsolutePath}") + conf.get("spark.yarn.dist.pyFiles") should be (s"s3a://${pyFile.getAbsolutePath}") // Local repl jars should be a local path. 
- sysProps("spark.repl.local.jars") should (startWith("file:")) + conf.get("spark.repl.local.jars") should (startWith("file:")) // local py files should not be a URI format. - sysProps("spark.submit.pyFiles") should (startWith("/")) + conf.get("spark.submit.pyFiles") should (startWith("/")) } test("download remote resource if it is not supported by yarn service") { @@ -955,9 +957,9 @@ class SparkSubmitSuite ) val appArgs = new SparkSubmitArguments(args) - val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs, Some(hadoopConf))._3 + val (_, _, conf, _) = SparkSubmit.prepareSubmitEnvironment(appArgs, Some(hadoopConf)) - val jars = sysProps("spark.yarn.dist.jars").split(",").toSet + val jars = conf.get("spark.yarn.dist.jars").split(",").toSet // The URI of remote S3 resource should still be remote. assert(jars.contains(tmpS3JarPath)) @@ -996,6 +998,21 @@ class SparkSubmitSuite conf.set("fs.s3a.impl", classOf[TestFileSystem].getCanonicalName) conf.set("fs.s3a.impl.disable.cache", "true") } + + test("start SparkApplication without modifying system properties") { + val args = Array( + "--class", classOf[TestSparkApplication].getName(), + "--master", "local", + "--conf", "spark.test.hello=world", + "spark-internal", + "hello") + + val exception = intercept[SparkException] { + SparkSubmit.main(args) + } + + assert(exception.getMessage() === "hello") + } } object SparkSubmitSuite extends SparkFunSuite with TimeLimits { @@ -1115,3 +1132,17 @@ class TestFileSystem extends org.apache.hadoop.fs.LocalFileSystem { override def open(path: Path): FSDataInputStream = super.open(local(path)) } + +class TestSparkApplication extends SparkApplication with Matchers { + + override def start(args: Array[String], conf: SparkConf): Unit = { + assert(args.size === 1) + assert(args(0) === "hello") + assert(conf.get("spark.test.hello") === "world") + assert(sys.props.get("spark.test.hello") === None) + + // This is how the test verifies the application was actually run. + throw new SparkException(args(0)) + } + +} diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 70887dc5dd97a..490baf040491f 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -445,9 +445,9 @@ class StandaloneRestSubmitSuite extends SparkFunSuite with BeforeAndAfterEach { "--class", mainClass, mainJar) ++ appArgs val args = new SparkSubmitArguments(commandLineArgs) - val (_, _, sparkProperties, _) = SparkSubmit.prepareSubmitEnvironment(args) + val (_, _, sparkConf, _) = SparkSubmit.prepareSubmitEnvironment(args) new RestSubmissionClient("spark://host:port").constructSubmitRequest( - mainJar, mainClass, appArgs, sparkProperties.toMap, Map.empty) + mainJar, mainClass, appArgs, sparkConf.getAll.toMap, Map.empty) } /** Return the response as a submit response, or fail with error otherwise. */ From a83d8d5adcb4e0061e43105767242ba9770dda96 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 26 Oct 2017 20:54:36 +0900 Subject: [PATCH 768/779] [SPARK-17902][R] Revive stringsAsFactors option for collect() in SparkR ## What changes were proposed in this pull request? This PR proposes to revive `stringsAsFactors` option in collect API, which was mistakenly removed in https://github.com/apache/spark/commit/71a138cd0e0a14e8426f97877e3b52a562bbd02c. 
Simply, it casts `character` to `factor` if it meets the condition, `stringsAsFactors && is.character(vec)` in primitive type conversion. ## How was this patch tested? Unit test in `R/pkg/tests/fulltests/test_sparkSQL.R`. Author: hyukjinkwon Closes #19551 from HyukjinKwon/SPARK-17902. --- R/pkg/R/DataFrame.R | 3 +++ R/pkg/tests/fulltests/test_sparkSQL.R | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 176bb3b8a8d0c..aaa3349d57506 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1191,6 +1191,9 @@ setMethod("collect", vec <- do.call(c, col) stopifnot(class(vec) != "list") class(vec) <- PRIMITIVE_TYPES[[colType]] + if (is.character(vec) && stringsAsFactors) { + vec <- as.factor(vec) + } df[[colIndex]] <- vec } else { df[[colIndex]] <- col diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 4382ef2ed4525..0c8118a7c73f3 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -499,6 +499,12 @@ test_that("create DataFrame with different data types", { expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE)) }) +test_that("SPARK-17902: collect() with stringsAsFactors enabled", { + df <- suppressWarnings(collect(createDataFrame(iris), stringsAsFactors = TRUE)) + expect_equal(class(iris$Species), class(df$Species)) + expect_equal(iris$Species, df$Species) +}) + test_that("SPARK-17811: can create DataFrame containing NA as date and time", { df <- data.frame( id = 1:2, From 0e9a750a8d389b3a17834584d31c204c77c6970d Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 26 Oct 2017 11:05:16 -0500 Subject: [PATCH 769/779] [SPARK-20643][CORE] Add listener implementation to collect app state. The initial listener code is based on the existing JobProgressListener (and others), and tries to mimic their behavior as much as possible. The change also includes some minor code movement so that some types and methods from the initial history server code can be reused. The code introduces a few mutable versions of public API types, used internally, to make it easier to update information without ugly copy methods, and also to make certain updates cheaper. Note the code here is not 100% correct. This is meant as a building ground for the UI integration in the next milestones. As different parts of the UI are ported, fixes will be made to the different parts of this code to account for the needed behavior. I also added annotations to API types so that Jackson is able to correctly deserialize options, sequences and maps that store primitive types. Author: Marcelo Vanzin Closes #19383 from vanzin/SPARK-20643. 
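For context, the core pattern the listener uses is: each live entity keeps cheap mutable counters that are updated on every event, and an immutable, API-shaped snapshot is flushed to the key-value store whenever the entity changes. Below is a minimal, self-contained Scala sketch of that pattern; the InMemoryStore, JobSnapshot, and LiveJobSketch names are hypothetical stand-ins for the real KVStore, the v1 API types, and LiveJob, and are not code from this patch.

    import scala.collection.mutable

    // Hypothetical stand-in for the real key-value store: values are keyed by
    // (value class, natural key) so different snapshot types do not collide.
    class InMemoryStore {
      private val data = mutable.Map[(Class[_], Any), Any]()
      def write(key: Any, value: Any): Unit = data((value.getClass, key)) = value
      def read[T](clazz: Class[T], key: Any): Option[T] =
        data.get((clazz, key)).map(_.asInstanceOf[T])
    }

    // Immutable snapshot, analogous to the REST-API-shaped types the listener stores.
    case class JobSnapshot(jobId: Int, activeTasks: Int, completedTasks: Int, failedTasks: Int)

    // Mutable live view, analogous in spirit to LiveJob: cheap to update on every
    // task event, and flushed to the store as a fresh immutable snapshot via write().
    class LiveJobSketch(val jobId: Int) {
      var activeTasks = 0
      var completedTasks = 0
      var failedTasks = 0

      private def doUpdate(): JobSnapshot =
        JobSnapshot(jobId, activeTasks, completedTasks, failedTasks)

      def write(store: InMemoryStore): Unit = store.write(jobId, doUpdate())
    }

    object LiveEntitySketch {
      def main(args: Array[String]): Unit = {
        val store = new InMemoryStore
        val job = new LiveJobSketch(jobId = 1)

        // Simulate a task start followed by a successful task end, flushing a
        // new immutable view to the store after each event.
        job.activeTasks += 1
        job.write(store)

        job.activeTasks -= 1
        job.completedTasks += 1
        job.write(store)

        println(store.read(classOf[JobSnapshot], 1)) // Some(JobSnapshot(1,0,1,0))
      }
    }

Keeping the mutable counters outside the store and writing only immutable snapshots is what makes per-task updates cheap while leaving only read-only data in the backing store.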
--- .../apache/spark/util/kvstore/KVTypeInfo.java | 2 + .../apache/spark/util/kvstore/LevelDB.java | 2 +- .../spark/status/api/v1/StageStatus.java | 3 +- .../deploy/history/FsHistoryProvider.scala | 37 +- .../apache/spark/deploy/history/config.scala | 6 - .../spark/status/AppStatusListener.scala | 531 ++++++++++++++ .../org/apache/spark/status/KVUtils.scala | 73 ++ .../org/apache/spark/status/LiveEntity.scala | 526 +++++++++++++ .../status/api/v1/AllStagesResource.scala | 4 +- .../org/apache/spark/status/api/v1/api.scala | 11 +- .../org/apache/spark/status/storeTypes.scala | 98 +++ .../history/FsHistoryProviderSuite.scala | 2 +- .../spark/status/AppStatusListenerSuite.scala | 690 ++++++++++++++++++ project/MimaExcludes.scala | 2 + 14 files changed, 1942 insertions(+), 45 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/status/AppStatusListener.scala create mode 100644 core/src/main/scala/org/apache/spark/status/KVUtils.scala create mode 100644 core/src/main/scala/org/apache/spark/status/LiveEntity.scala create mode 100644 core/src/main/scala/org/apache/spark/status/storeTypes.scala create mode 100644 core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java index a2b077e4531ee..870b484f99068 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java @@ -46,6 +46,7 @@ public KVTypeInfo(Class type) throws Exception { KVIndex idx = f.getAnnotation(KVIndex.class); if (idx != null) { checkIndex(idx, indices); + f.setAccessible(true); indices.put(idx.value(), idx); f.setAccessible(true); accessors.put(idx.value(), new FieldAccessor(f)); @@ -58,6 +59,7 @@ public KVTypeInfo(Class type) throws Exception { checkIndex(idx, indices); Preconditions.checkArgument(m.getParameterTypes().length == 0, "Annotated method %s::%s should not have any parameters.", type.getName(), m.getName()); + m.setAccessible(true); indices.put(idx.value(), idx); m.setAccessible(true); accessors.put(idx.value(), new MethodAccessor(m)); diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java index ff48b155fab31..4f9e10ca20066 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java @@ -76,7 +76,7 @@ public LevelDB(File path, KVStoreSerializer serializer) throws Exception { this.types = new ConcurrentHashMap<>(); Options options = new Options(); - options.createIfMissing(!path.exists()); + options.createIfMissing(true); this._db = new AtomicReference<>(JniDBFactory.factory.open(path, options)); byte[] versionData = db().get(STORE_VERSION_KEY); diff --git a/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java b/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java index 9dbb565aab707..40b5f627369d5 100644 --- a/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java +++ b/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java @@ -23,7 +23,8 @@ public enum StageStatus { ACTIVE, COMPLETE, FAILED, - PENDING; + PENDING, + SKIPPED; public static StageStatus fromString(String str) { return EnumUtil.parseIgnoreCase(StageStatus.class, str); diff --git 
a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 3889dd097ee59..cf97597b484d8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -42,6 +42,7 @@ import org.apache.spark.deploy.history.config._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler._ import org.apache.spark.scheduler.ReplayListenerBus._ +import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1 import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} @@ -129,29 +130,15 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Visible for testing. private[history] val listing: KVStore = storePath.map { path => val dbPath = new File(path, "listing.ldb") - - def openDB(): LevelDB = new LevelDB(dbPath, new KVStoreScalaSerializer()) + val metadata = new FsHistoryProviderMetadata(CURRENT_LISTING_VERSION, logDir.toString()) try { - val db = openDB() - val meta = db.getMetadata(classOf[KVStoreMetadata]) - - if (meta == null) { - db.setMetadata(new KVStoreMetadata(CURRENT_LISTING_VERSION, logDir)) - db - } else if (meta.version != CURRENT_LISTING_VERSION || !logDir.equals(meta.logDir)) { - logInfo("Detected mismatched config in existing DB, deleting...") - db.close() - Utils.deleteRecursively(dbPath) - openDB() - } else { - db - } + open(new File(path, "listing.ldb"), metadata) } catch { - case _: UnsupportedStoreVersionException => + case _: UnsupportedStoreVersionException | _: MetadataMismatchException => logInfo("Detected incompatible DB versions, deleting...") Utils.deleteRecursively(dbPath) - openDB() + open(new File(path, "listing.ldb"), metadata) } }.getOrElse(new InMemoryStore()) @@ -720,19 +707,7 @@ private[history] object FsHistoryProvider { private[history] val CURRENT_LISTING_VERSION = 1L } -/** - * A KVStoreSerializer that provides Scala types serialization too, and uses the same options as - * the API serializer. - */ -private class KVStoreScalaSerializer extends KVStoreSerializer { - - mapper.registerModule(DefaultScalaModule) - mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL) - mapper.setDateFormat(v1.JacksonMessageWriter.makeISODateFormat) - -} - -private[history] case class KVStoreMetadata( +private[history] case class FsHistoryProviderMetadata( version: Long, logDir: String) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/config.scala b/core/src/main/scala/org/apache/spark/deploy/history/config.scala index fb9e997def0dd..52dedc1a2ed41 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/config.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/config.scala @@ -19,16 +19,10 @@ package org.apache.spark.deploy.history import java.util.concurrent.TimeUnit -import scala.annotation.meta.getter - import org.apache.spark.internal.config.ConfigBuilder -import org.apache.spark.util.kvstore.KVIndex private[spark] object config { - /** Use this to annotate constructor params to be used as KVStore indices. 
*/ - type KVIndexParam = KVIndex @getter - val DEFAULT_LOG_DIR = "file:/tmp/spark-events" val EVENT_LOG_DIR = ConfigBuilder("spark.history.fs.logDirectory") diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala new file mode 100644 index 0000000000000..f120685c941df --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -0,0 +1,531 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import java.util.Date + +import scala.collection.mutable.HashMap + +import org.apache.spark._ +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler._ +import org.apache.spark.status.api.v1 +import org.apache.spark.storage._ +import org.apache.spark.ui.SparkUI +import org.apache.spark.util.kvstore.KVStore + +/** + * A Spark listener that writes application information to a data store. The types written to the + * store are defined in the `storeTypes.scala` file and are based on the public REST API. + */ +private class AppStatusListener(kvstore: KVStore) extends SparkListener with Logging { + + private var sparkVersion = SPARK_VERSION + private var appInfo: v1.ApplicationInfo = null + private var coresPerTask: Int = 1 + + // Keep track of live entities, so that task metrics can be efficiently updated (without + // causing too many writes to the underlying store, and other expensive operations). 
+ private val liveStages = new HashMap[(Int, Int), LiveStage]() + private val liveJobs = new HashMap[Int, LiveJob]() + private val liveExecutors = new HashMap[String, LiveExecutor]() + private val liveTasks = new HashMap[Long, LiveTask]() + private val liveRDDs = new HashMap[Int, LiveRDD]() + + override def onOtherEvent(event: SparkListenerEvent): Unit = event match { + case SparkListenerLogStart(version) => sparkVersion = version + case _ => + } + + override def onApplicationStart(event: SparkListenerApplicationStart): Unit = { + assert(event.appId.isDefined, "Application without IDs are not supported.") + + val attempt = new v1.ApplicationAttemptInfo( + event.appAttemptId, + new Date(event.time), + new Date(-1), + new Date(event.time), + -1L, + event.sparkUser, + false, + sparkVersion) + + appInfo = new v1.ApplicationInfo( + event.appId.get, + event.appName, + None, + None, + None, + None, + Seq(attempt)) + + kvstore.write(new ApplicationInfoWrapper(appInfo)) + } + + override def onApplicationEnd(event: SparkListenerApplicationEnd): Unit = { + val old = appInfo.attempts.head + val attempt = new v1.ApplicationAttemptInfo( + old.attemptId, + old.startTime, + new Date(event.time), + new Date(event.time), + event.time - old.startTime.getTime(), + old.sparkUser, + true, + old.appSparkVersion) + + appInfo = new v1.ApplicationInfo( + appInfo.id, + appInfo.name, + None, + None, + None, + None, + Seq(attempt)) + kvstore.write(new ApplicationInfoWrapper(appInfo)) + } + + override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { + // This needs to be an update in case an executor re-registers after the driver has + // marked it as "dead". + val exec = getOrCreateExecutor(event.executorId) + exec.host = event.executorInfo.executorHost + exec.isActive = true + exec.totalCores = event.executorInfo.totalCores + exec.maxTasks = event.executorInfo.totalCores / coresPerTask + exec.executorLogs = event.executorInfo.logUrlMap + update(exec) + } + + override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { + liveExecutors.remove(event.executorId).foreach { exec => + exec.isActive = false + update(exec) + } + } + + override def onExecutorBlacklisted(event: SparkListenerExecutorBlacklisted): Unit = { + updateBlackListStatus(event.executorId, true) + } + + override def onExecutorUnblacklisted(event: SparkListenerExecutorUnblacklisted): Unit = { + updateBlackListStatus(event.executorId, false) + } + + override def onNodeBlacklisted(event: SparkListenerNodeBlacklisted): Unit = { + updateNodeBlackList(event.hostId, true) + } + + override def onNodeUnblacklisted(event: SparkListenerNodeUnblacklisted): Unit = { + updateNodeBlackList(event.hostId, false) + } + + private def updateBlackListStatus(execId: String, blacklisted: Boolean): Unit = { + liveExecutors.get(execId).foreach { exec => + exec.isBlacklisted = blacklisted + update(exec) + } + } + + private def updateNodeBlackList(host: String, blacklisted: Boolean): Unit = { + // Implicitly (un)blacklist every executor associated with the node. + liveExecutors.values.foreach { exec => + if (exec.hostname == host) { + exec.isBlacklisted = blacklisted + update(exec) + } + } + } + + override def onJobStart(event: SparkListenerJobStart): Unit = { + // Compute (a potential over-estimate of) the number of tasks that will be run by this job. 
+ // This may be an over-estimate because the job start event references all of the result + // stages' transitive stage dependencies, but some of these stages might be skipped if their + // output is available from earlier runs. + // See https://github.com/apache/spark/pull/3009 for a more extensive discussion. + val numTasks = { + val missingStages = event.stageInfos.filter(_.completionTime.isEmpty) + missingStages.map(_.numTasks).sum + } + + val lastStageInfo = event.stageInfos.lastOption + val lastStageName = lastStageInfo.map(_.name).getOrElse("(Unknown Stage Name)") + + val jobGroup = Option(event.properties) + .flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) } + + val job = new LiveJob( + event.jobId, + lastStageName, + Some(new Date(event.time)), + event.stageIds, + jobGroup, + numTasks) + liveJobs.put(event.jobId, job) + update(job) + + event.stageInfos.foreach { stageInfo => + // A new job submission may re-use an existing stage, so this code needs to do an update + // instead of just a write. + val stage = getOrCreateStage(stageInfo) + stage.jobs :+= job + stage.jobIds += event.jobId + update(stage) + } + } + + override def onJobEnd(event: SparkListenerJobEnd): Unit = { + liveJobs.remove(event.jobId).foreach { job => + job.status = event.jobResult match { + case JobSucceeded => JobExecutionStatus.SUCCEEDED + case JobFailed(_) => JobExecutionStatus.FAILED + } + + job.completionTime = Some(new Date(event.time)) + update(job) + } + } + + override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { + val stage = getOrCreateStage(event.stageInfo) + stage.status = v1.StageStatus.ACTIVE + stage.schedulingPool = Option(event.properties).flatMap { p => + Option(p.getProperty("spark.scheduler.pool")) + }.getOrElse(SparkUI.DEFAULT_POOL_NAME) + + // Look at all active jobs to find the ones that mention this stage. + stage.jobs = liveJobs.values + .filter(_.stageIds.contains(event.stageInfo.stageId)) + .toSeq + stage.jobIds = stage.jobs.map(_.jobId).toSet + + stage.jobs.foreach { job => + job.completedStages = job.completedStages - event.stageInfo.stageId + job.activeStages += 1 + update(job) + } + + event.stageInfo.rddInfos.foreach { info => + if (info.storageLevel.isValid) { + update(liveRDDs.getOrElseUpdate(info.id, new LiveRDD(info))) + } + } + + update(stage) + } + + override def onTaskStart(event: SparkListenerTaskStart): Unit = { + val task = new LiveTask(event.taskInfo, event.stageId, event.stageAttemptId) + liveTasks.put(event.taskInfo.taskId, task) + update(task) + + liveStages.get((event.stageId, event.stageAttemptId)).foreach { stage => + stage.activeTasks += 1 + stage.firstLaunchTime = math.min(stage.firstLaunchTime, event.taskInfo.launchTime) + update(stage) + + stage.jobs.foreach { job => + job.activeTasks += 1 + update(job) + } + } + + liveExecutors.get(event.taskInfo.executorId).foreach { exec => + exec.activeTasks += 1 + exec.totalTasks += 1 + update(exec) + } + } + + override def onTaskGettingResult(event: SparkListenerTaskGettingResult): Unit = { + // Call update on the task so that the "getting result" time is written to the store; the + // value is part of the mutable TaskInfo state that the live entity already references. + liveTasks.get(event.taskInfo.taskId).foreach { task => + update(task) + } + } + + override def onTaskEnd(event: SparkListenerTaskEnd): Unit = { + // TODO: can this really happen? 
+ if (event.taskInfo == null) { + return + } + + val metricsDelta = liveTasks.remove(event.taskInfo.taskId).map { task => + val errorMessage = event.reason match { + case Success => + None + case k: TaskKilled => + Some(k.reason) + case e: ExceptionFailure => // Handle ExceptionFailure because we might have accumUpdates + Some(e.toErrorString) + case e: TaskFailedReason => // All other failure cases + Some(e.toErrorString) + case other => + logInfo(s"Unhandled task end reason: $other") + None + } + task.errorMessage = errorMessage + val delta = task.updateMetrics(event.taskMetrics) + update(task) + delta + }.orNull + + val (completedDelta, failedDelta) = event.reason match { + case Success => + (1, 0) + case _ => + (0, 1) + } + + liveStages.get((event.stageId, event.stageAttemptId)).foreach { stage => + if (metricsDelta != null) { + stage.metrics.update(metricsDelta) + } + stage.activeTasks -= 1 + stage.completedTasks += completedDelta + stage.failedTasks += failedDelta + update(stage) + + stage.jobs.foreach { job => + job.activeTasks -= 1 + job.completedTasks += completedDelta + job.failedTasks += failedDelta + update(job) + } + + val esummary = stage.executorSummary(event.taskInfo.executorId) + esummary.taskTime += event.taskInfo.duration + esummary.succeededTasks += completedDelta + esummary.failedTasks += failedDelta + if (metricsDelta != null) { + esummary.metrics.update(metricsDelta) + } + update(esummary) + } + + liveExecutors.get(event.taskInfo.executorId).foreach { exec => + if (event.taskMetrics != null) { + val readMetrics = event.taskMetrics.shuffleReadMetrics + exec.totalGcTime += event.taskMetrics.jvmGCTime + exec.totalInputBytes += event.taskMetrics.inputMetrics.bytesRead + exec.totalShuffleRead += readMetrics.localBytesRead + readMetrics.remoteBytesRead + exec.totalShuffleWrite += event.taskMetrics.shuffleWriteMetrics.bytesWritten + } + + exec.activeTasks -= 1 + exec.completedTasks += completedDelta + exec.failedTasks += failedDelta + exec.totalDuration += event.taskInfo.duration + update(exec) + } + } + + override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { + liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId)).foreach { stage => + stage.info = event.stageInfo + + // Because of SPARK-20205, old event logs may contain valid stages without a submission time + // in their start event. In those cases, we can only detect whether a stage was skipped by + // waiting until the completion event, at which point the field would have been set. + stage.status = event.stageInfo.failureReason match { + case Some(_) => v1.StageStatus.FAILED + case _ if event.stageInfo.submissionTime.isDefined => v1.StageStatus.COMPLETE + case _ => v1.StageStatus.SKIPPED + } + update(stage) + + stage.jobs.foreach { job => + stage.status match { + case v1.StageStatus.COMPLETE => + job.completedStages += event.stageInfo.stageId + case v1.StageStatus.SKIPPED => + job.skippedStages += event.stageInfo.stageId + job.skippedTasks += event.stageInfo.numTasks + case _ => + job.failedStages += 1 + } + job.activeStages -= 1 + update(job) + } + + stage.executorSummaries.values.foreach(update) + update(stage) + } + } + + override def onBlockManagerAdded(event: SparkListenerBlockManagerAdded): Unit = { + // This needs to set fields that are already set by onExecutorAdded because the driver is + // considered an "executor" in the UI, but does not have a SparkListenerExecutorAdded event. 
+ val exec = getOrCreateExecutor(event.blockManagerId.executorId) + exec.hostPort = event.blockManagerId.hostPort + event.maxOnHeapMem.foreach { _ => + exec.totalOnHeap = event.maxOnHeapMem.get + exec.totalOffHeap = event.maxOffHeapMem.get + } + exec.isActive = true + exec.maxMemory = event.maxMem + update(exec) + } + + override def onBlockManagerRemoved(event: SparkListenerBlockManagerRemoved): Unit = { + // Nothing to do here. Covered by onExecutorRemoved. + } + + override def onUnpersistRDD(event: SparkListenerUnpersistRDD): Unit = { + liveRDDs.remove(event.rddId) + kvstore.delete(classOf[RDDStorageInfoWrapper], event.rddId) + } + + override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { + event.accumUpdates.foreach { case (taskId, sid, sAttempt, accumUpdates) => + liveTasks.get(taskId).foreach { task => + val metrics = TaskMetrics.fromAccumulatorInfos(accumUpdates) + val delta = task.updateMetrics(metrics) + update(task) + + liveStages.get((sid, sAttempt)).foreach { stage => + stage.metrics.update(delta) + update(stage) + + val esummary = stage.executorSummary(event.execId) + esummary.metrics.update(delta) + update(esummary) + } + } + } + } + + override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = { + event.blockUpdatedInfo.blockId match { + case block: RDDBlockId => updateRDDBlock(event, block) + case _ => // TODO: API only covers RDD storage. + } + } + + private def updateRDDBlock(event: SparkListenerBlockUpdated, block: RDDBlockId): Unit = { + val executorId = event.blockUpdatedInfo.blockManagerId.executorId + + // Whether values are being added to or removed from the existing accounting. + val storageLevel = event.blockUpdatedInfo.storageLevel + val diskDelta = event.blockUpdatedInfo.diskSize * (if (storageLevel.useDisk) 1 else -1) + val memoryDelta = event.blockUpdatedInfo.memSize * (if (storageLevel.useMemory) 1 else -1) + + // Function to apply a delta to a value, but ensure that it doesn't go negative. + def newValue(old: Long, delta: Long): Long = math.max(0, old + delta) + + val updatedStorageLevel = if (storageLevel.isValid) { + Some(storageLevel.description) + } else { + None + } + + // We need information about the executor to update some memory accounting values in the + // RDD info, so read that beforehand. + val maybeExec = liveExecutors.get(executorId) + var rddBlocksDelta = 0 + + // Update the block entry in the RDD info, keeping track of the deltas above so that we + // can update the executor information too. + liveRDDs.get(block.rddId).foreach { rdd => + val partition = rdd.partition(block.name) + + val executors = if (updatedStorageLevel.isDefined) { + if (!partition.executors.contains(executorId)) { + rddBlocksDelta = 1 + } + partition.executors + executorId + } else { + rddBlocksDelta = -1 + partition.executors - executorId + } + + // Only update the partition if it's still stored in some executor, otherwise get rid of it. 
+ if (executors.nonEmpty) { + if (updatedStorageLevel.isDefined) { + partition.storageLevel = updatedStorageLevel.get + } + partition.memoryUsed = newValue(partition.memoryUsed, memoryDelta) + partition.diskUsed = newValue(partition.diskUsed, diskDelta) + partition.executors = executors + } else { + rdd.removePartition(block.name) + } + + maybeExec.foreach { exec => + if (exec.rddBlocks + rddBlocksDelta > 0) { + val dist = rdd.distribution(exec) + dist.memoryRemaining = newValue(dist.memoryRemaining, -memoryDelta) + dist.memoryUsed = newValue(dist.memoryUsed, memoryDelta) + dist.diskUsed = newValue(dist.diskUsed, diskDelta) + + if (exec.hasMemoryInfo) { + if (storageLevel.useOffHeap) { + dist.offHeapUsed = newValue(dist.offHeapUsed, memoryDelta) + dist.offHeapRemaining = newValue(dist.offHeapRemaining, -memoryDelta) + } else { + dist.onHeapUsed = newValue(dist.onHeapUsed, memoryDelta) + dist.onHeapRemaining = newValue(dist.onHeapRemaining, -memoryDelta) + } + } + } else { + rdd.removeDistribution(exec) + } + } + + if (updatedStorageLevel.isDefined) { + rdd.storageLevel = updatedStorageLevel.get + } + rdd.memoryUsed = newValue(rdd.memoryUsed, memoryDelta) + rdd.diskUsed = newValue(rdd.diskUsed, diskDelta) + update(rdd) + } + + maybeExec.foreach { exec => + if (exec.hasMemoryInfo) { + if (storageLevel.useOffHeap) { + exec.usedOffHeap = newValue(exec.usedOffHeap, memoryDelta) + } else { + exec.usedOnHeap = newValue(exec.usedOnHeap, memoryDelta) + } + } + exec.memoryUsed = newValue(exec.memoryUsed, memoryDelta) + exec.diskUsed = newValue(exec.diskUsed, diskDelta) + exec.rddBlocks += rddBlocksDelta + if (exec.hasMemoryInfo || rddBlocksDelta != 0) { + update(exec) + } + } + } + + private def getOrCreateExecutor(executorId: String): LiveExecutor = { + liveExecutors.getOrElseUpdate(executorId, new LiveExecutor(executorId)) + } + + private def getOrCreateStage(info: StageInfo): LiveStage = { + val stage = liveStages.getOrElseUpdate((info.stageId, info.attemptId), new LiveStage()) + stage.info = info + stage + } + + private def update(entity: LiveEntity): Unit = { + entity.write(kvstore) + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala new file mode 100644 index 0000000000000..4638511944c61 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.status + +import java.io.File + +import scala.annotation.meta.getter +import scala.language.implicitConversions +import scala.reflect.{classTag, ClassTag} + +import com.fasterxml.jackson.annotation.JsonInclude +import com.fasterxml.jackson.module.scala.DefaultScalaModule + +import org.apache.spark.internal.Logging +import org.apache.spark.util.kvstore._ + +private[spark] object KVUtils extends Logging { + + /** Use this to annotate constructor params to be used as KVStore indices. */ + type KVIndexParam = KVIndex @getter + + /** + * A KVStoreSerializer that provides Scala types serialization too, and uses the same options as + * the API serializer. + */ + private[spark] class KVStoreScalaSerializer extends KVStoreSerializer { + + mapper.registerModule(DefaultScalaModule) + mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL) + + } + + /** + * Open or create a LevelDB store. + * + * @param path Location of the store. + * @param metadata Metadata value to compare to the data in the store. If the store does not + * contain any metadata (e.g. it's a new store), this value is written as + * the store's metadata. + */ + def open[M: ClassTag](path: File, metadata: M): LevelDB = { + require(metadata != null, "Metadata is required.") + + val db = new LevelDB(path, new KVStoreScalaSerializer()) + val dbMeta = db.getMetadata(classTag[M].runtimeClass) + if (dbMeta == null) { + db.setMetadata(metadata) + } else if (dbMeta != metadata) { + db.close() + throw new MetadataMismatchException() + } + + db + } + + private[spark] class MetadataMismatchException extends Exception + +} diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala new file mode 100644 index 0000000000000..63fa36580bc7d --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -0,0 +1,526 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import java.util.Date + +import scala.collection.mutable.HashMap + +import org.apache.spark.JobExecutionStatus +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} +import org.apache.spark.status.api.v1 +import org.apache.spark.storage.RDDInfo +import org.apache.spark.ui.SparkUI +import org.apache.spark.util.AccumulatorContext +import org.apache.spark.util.kvstore.KVStore + +/** + * A mutable representation of a live entity in Spark (jobs, stages, tasks, et al). Every live + * entity uses one of these instances to keep track of their evolving state, and periodically + * flush an immutable view of the entity to the app state store. 
+ */ +private[spark] abstract class LiveEntity { + + def write(store: KVStore): Unit = { + store.write(doUpdate()) + } + + /** + * Returns an updated view of entity data, to be stored in the status store, reflecting the + * latest information collected by the listener. + */ + protected def doUpdate(): Any + +} + +private class LiveJob( + val jobId: Int, + name: String, + submissionTime: Option[Date], + val stageIds: Seq[Int], + jobGroup: Option[String], + numTasks: Int) extends LiveEntity { + + var activeTasks = 0 + var completedTasks = 0 + var failedTasks = 0 + + var skippedTasks = 0 + var skippedStages = Set[Int]() + + var status = JobExecutionStatus.RUNNING + var completionTime: Option[Date] = None + + var completedStages: Set[Int] = Set() + var activeStages = 0 + var failedStages = 0 + + override protected def doUpdate(): Any = { + val info = new v1.JobData( + jobId, + name, + None, // description is always None? + submissionTime, + completionTime, + stageIds, + jobGroup, + status, + numTasks, + activeTasks, + completedTasks, + skippedTasks, + failedTasks, + activeStages, + completedStages.size, + skippedStages.size, + failedStages) + new JobDataWrapper(info, skippedStages) + } + +} + +private class LiveTask( + info: TaskInfo, + stageId: Int, + stageAttemptId: Int) extends LiveEntity { + + import LiveEntityHelpers._ + + private var recordedMetrics: v1.TaskMetrics = null + + var errorMessage: Option[String] = None + + /** + * Update the metrics for the task and return the difference between the previous and new + * values. + */ + def updateMetrics(metrics: TaskMetrics): v1.TaskMetrics = { + if (metrics != null) { + val old = recordedMetrics + recordedMetrics = new v1.TaskMetrics( + metrics.executorDeserializeTime, + metrics.executorDeserializeCpuTime, + metrics.executorRunTime, + metrics.executorCpuTime, + metrics.resultSize, + metrics.jvmGCTime, + metrics.resultSerializationTime, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled, + new v1.InputMetrics( + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead), + new v1.OutputMetrics( + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten), + new v1.ShuffleReadMetrics( + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead), + new v1.ShuffleWriteMetrics( + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten)) + if (old != null) calculateMetricsDelta(recordedMetrics, old) else recordedMetrics + } else { + null + } + } + + /** + * Return a new TaskMetrics object containing the delta of the various fields of the given + * metrics objects. This is currently targeted at updating stage data, so it does not + * necessarily calculate deltas for all the fields. 
+ */ + private def calculateMetricsDelta( + metrics: v1.TaskMetrics, + old: v1.TaskMetrics): v1.TaskMetrics = { + val shuffleWriteDelta = new v1.ShuffleWriteMetrics( + metrics.shuffleWriteMetrics.bytesWritten - old.shuffleWriteMetrics.bytesWritten, + 0L, + metrics.shuffleWriteMetrics.recordsWritten - old.shuffleWriteMetrics.recordsWritten) + + val shuffleReadDelta = new v1.ShuffleReadMetrics( + 0L, 0L, 0L, + metrics.shuffleReadMetrics.remoteBytesRead - old.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk - + old.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead - old.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead - old.shuffleReadMetrics.recordsRead) + + val inputDelta = new v1.InputMetrics( + metrics.inputMetrics.bytesRead - old.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead - old.inputMetrics.recordsRead) + + val outputDelta = new v1.OutputMetrics( + metrics.outputMetrics.bytesWritten - old.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten - old.outputMetrics.recordsWritten) + + new v1.TaskMetrics( + 0L, 0L, + metrics.executorRunTime - old.executorRunTime, + metrics.executorCpuTime - old.executorCpuTime, + 0L, 0L, 0L, + metrics.memoryBytesSpilled - old.memoryBytesSpilled, + metrics.diskBytesSpilled - old.diskBytesSpilled, + inputDelta, + outputDelta, + shuffleReadDelta, + shuffleWriteDelta) + } + + override protected def doUpdate(): Any = { + val task = new v1.TaskData( + info.taskId, + info.index, + info.attemptNumber, + new Date(info.launchTime), + if (info.finished) Some(info.duration) else None, + info.executorId, + info.host, + info.status, + info.taskLocality.toString(), + info.speculative, + newAccumulatorInfos(info.accumulables), + errorMessage, + Option(recordedMetrics)) + new TaskDataWrapper(task) + } + +} + +private class LiveExecutor(val executorId: String) extends LiveEntity { + + var hostPort: String = null + var host: String = null + var isActive = true + var totalCores = 0 + + var rddBlocks = 0 + var memoryUsed = 0L + var diskUsed = 0L + var maxTasks = 0 + var maxMemory = 0L + + var totalTasks = 0 + var activeTasks = 0 + var completedTasks = 0 + var failedTasks = 0 + var totalDuration = 0L + var totalGcTime = 0L + var totalInputBytes = 0L + var totalShuffleRead = 0L + var totalShuffleWrite = 0L + var isBlacklisted = false + + var executorLogs = Map[String, String]() + + // Memory metrics. They may not be recorded (e.g. old event logs) so if totalOnHeap is not + // initialized, the store will not contain this information. + var totalOnHeap = -1L + var totalOffHeap = 0L + var usedOnHeap = 0L + var usedOffHeap = 0L + + def hasMemoryInfo: Boolean = totalOnHeap >= 0L + + def hostname: String = if (host != null) host else hostPort.split(":")(0) + + override protected def doUpdate(): Any = { + val memoryMetrics = if (totalOnHeap >= 0) { + Some(new v1.MemoryMetrics(usedOnHeap, usedOffHeap, totalOnHeap, totalOffHeap)) + } else { + None + } + + val info = new v1.ExecutorSummary( + executorId, + if (hostPort != null) hostPort else host, + isActive, + rddBlocks, + memoryUsed, + diskUsed, + totalCores, + maxTasks, + activeTasks, + failedTasks, + completedTasks, + totalTasks, + totalDuration, + totalGcTime, + totalInputBytes, + totalShuffleRead, + totalShuffleWrite, + isBlacklisted, + maxMemory, + executorLogs, + memoryMetrics) + new ExecutorSummaryWrapper(info) + } + +} + +/** Metrics tracked per stage (both total and per executor). 
*/ +private class MetricsTracker { + var executorRunTime = 0L + var executorCpuTime = 0L + var inputBytes = 0L + var inputRecords = 0L + var outputBytes = 0L + var outputRecords = 0L + var shuffleReadBytes = 0L + var shuffleReadRecords = 0L + var shuffleWriteBytes = 0L + var shuffleWriteRecords = 0L + var memoryBytesSpilled = 0L + var diskBytesSpilled = 0L + + def update(delta: v1.TaskMetrics): Unit = { + executorRunTime += delta.executorRunTime + executorCpuTime += delta.executorCpuTime + inputBytes += delta.inputMetrics.bytesRead + inputRecords += delta.inputMetrics.recordsRead + outputBytes += delta.outputMetrics.bytesWritten + outputRecords += delta.outputMetrics.recordsWritten + shuffleReadBytes += delta.shuffleReadMetrics.localBytesRead + + delta.shuffleReadMetrics.remoteBytesRead + shuffleReadRecords += delta.shuffleReadMetrics.recordsRead + shuffleWriteBytes += delta.shuffleWriteMetrics.bytesWritten + shuffleWriteRecords += delta.shuffleWriteMetrics.recordsWritten + memoryBytesSpilled += delta.memoryBytesSpilled + diskBytesSpilled += delta.diskBytesSpilled + } + +} + +private class LiveExecutorStageSummary( + stageId: Int, + attemptId: Int, + executorId: String) extends LiveEntity { + + var taskTime = 0L + var succeededTasks = 0 + var failedTasks = 0 + var killedTasks = 0 + + val metrics = new MetricsTracker() + + override protected def doUpdate(): Any = { + val info = new v1.ExecutorStageSummary( + taskTime, + failedTasks, + succeededTasks, + metrics.inputBytes, + metrics.outputBytes, + metrics.shuffleReadBytes, + metrics.shuffleWriteBytes, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled) + new ExecutorStageSummaryWrapper(stageId, attemptId, executorId, info) + } + +} + +private class LiveStage extends LiveEntity { + + import LiveEntityHelpers._ + + var jobs = Seq[LiveJob]() + var jobIds = Set[Int]() + + var info: StageInfo = null + var status = v1.StageStatus.PENDING + + var schedulingPool: String = SparkUI.DEFAULT_POOL_NAME + + var activeTasks = 0 + var completedTasks = 0 + var failedTasks = 0 + + var firstLaunchTime = Long.MaxValue + + val metrics = new MetricsTracker() + + val executorSummaries = new HashMap[String, LiveExecutorStageSummary]() + + def executorSummary(executorId: String): LiveExecutorStageSummary = { + executorSummaries.getOrElseUpdate(executorId, + new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) + } + + override protected def doUpdate(): Any = { + val update = new v1.StageData( + status, + info.stageId, + info.attemptId, + + activeTasks, + completedTasks, + failedTasks, + + metrics.executorRunTime, + metrics.executorCpuTime, + info.submissionTime.map(new Date(_)), + if (firstLaunchTime < Long.MaxValue) Some(new Date(firstLaunchTime)) else None, + info.completionTime.map(new Date(_)), + + metrics.inputBytes, + metrics.inputRecords, + metrics.outputBytes, + metrics.outputRecords, + metrics.shuffleReadBytes, + metrics.shuffleReadRecords, + metrics.shuffleWriteBytes, + metrics.shuffleWriteRecords, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled, + + info.name, + info.details, + schedulingPool, + + newAccumulatorInfos(info.accumulables.values), + None, + None) + + new StageDataWrapper(update, jobIds) + } + +} + +private class LiveRDDPartition(val blockName: String) { + + var executors = Set[String]() + var storageLevel: String = null + var memoryUsed = 0L + var diskUsed = 0L + + def toApi(): v1.RDDPartitionInfo = { + new v1.RDDPartitionInfo( + blockName, + storageLevel, + memoryUsed, + diskUsed, + executors.toSeq.sorted) + 
} + +} + +private class LiveRDDDistribution(val exec: LiveExecutor) { + + var memoryRemaining = exec.maxMemory + var memoryUsed = 0L + var diskUsed = 0L + + var onHeapUsed = 0L + var offHeapUsed = 0L + var onHeapRemaining = 0L + var offHeapRemaining = 0L + + def toApi(): v1.RDDDataDistribution = { + new v1.RDDDataDistribution( + exec.hostPort, + memoryUsed, + memoryRemaining, + diskUsed, + if (exec.hasMemoryInfo) Some(onHeapUsed) else None, + if (exec.hasMemoryInfo) Some(offHeapUsed) else None, + if (exec.hasMemoryInfo) Some(onHeapRemaining) else None, + if (exec.hasMemoryInfo) Some(offHeapRemaining) else None) + } + +} + +private class LiveRDD(info: RDDInfo) extends LiveEntity { + + var storageLevel: String = info.storageLevel.description + var memoryUsed = 0L + var diskUsed = 0L + + private val partitions = new HashMap[String, LiveRDDPartition]() + private val distributions = new HashMap[String, LiveRDDDistribution]() + + def partition(blockName: String): LiveRDDPartition = { + partitions.getOrElseUpdate(blockName, new LiveRDDPartition(blockName)) + } + + def removePartition(blockName: String): Unit = partitions.remove(blockName) + + def distribution(exec: LiveExecutor): LiveRDDDistribution = { + distributions.getOrElseUpdate(exec.hostPort, new LiveRDDDistribution(exec)) + } + + def removeDistribution(exec: LiveExecutor): Unit = { + distributions.remove(exec.hostPort) + } + + override protected def doUpdate(): Any = { + val parts = if (partitions.nonEmpty) { + Some(partitions.values.toList.sortBy(_.blockName).map(_.toApi())) + } else { + None + } + + val dists = if (distributions.nonEmpty) { + Some(distributions.values.toList.sortBy(_.exec.executorId).map(_.toApi())) + } else { + None + } + + val rdd = new v1.RDDStorageInfo( + info.id, + info.name, + info.numPartitions, + partitions.size, + storageLevel, + memoryUsed, + diskUsed, + dists, + parts) + + new RDDStorageInfoWrapper(rdd) + } + +} + +private object LiveEntityHelpers { + + def newAccumulatorInfos(accums: Iterable[AccumulableInfo]): Seq[v1.AccumulableInfo] = { + accums + .filter { acc => + // We don't need to store internal or SQL accumulables as their values will be shown in + // other places, so drop them to reduce the memory usage. 
+ !acc.internal && (!acc.metadata.isDefined || + acc.metadata.get != Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER)) + } + .map { acc => + new v1.AccumulableInfo( + acc.id, + acc.name.map(_.intern()).orNull, + acc.update.map(_.toString()), + acc.value.map(_.toString()).orNull) + } + .toSeq + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala index 4a4ed954d689e..5f69949c618fd 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala @@ -71,7 +71,7 @@ private[v1] object AllStagesResource { val taskData = if (includeDetails) { Some(stageUiData.taskData.map { case (k, v) => - k -> convertTaskData(v, stageUiData.lastUpdateTime) }) + k -> convertTaskData(v, stageUiData.lastUpdateTime) }.toMap) } else { None } @@ -88,7 +88,7 @@ private[v1] object AllStagesResource { memoryBytesSpilled = summary.memoryBytesSpilled, diskBytesSpilled = summary.diskBytesSpilled ) - }) + }.toMap) } else { None } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 31659b25db318..bff6f90823f40 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -16,11 +16,11 @@ */ package org.apache.spark.status.api.v1 +import java.lang.{Long => JLong} import java.util.Date -import scala.collection.Map - import com.fasterxml.jackson.annotation.JsonIgnoreProperties +import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.JobExecutionStatus @@ -129,9 +129,13 @@ class RDDDataDistribution private[spark]( val memoryUsed: Long, val memoryRemaining: Long, val diskUsed: Long, + @JsonDeserialize(contentAs = classOf[JLong]) val onHeapMemoryUsed: Option[Long], + @JsonDeserialize(contentAs = classOf[JLong]) val offHeapMemoryUsed: Option[Long], + @JsonDeserialize(contentAs = classOf[JLong]) val onHeapMemoryRemaining: Option[Long], + @JsonDeserialize(contentAs = classOf[JLong]) val offHeapMemoryRemaining: Option[Long]) class RDDPartitionInfo private[spark]( @@ -179,7 +183,8 @@ class TaskData private[spark]( val index: Int, val attempt: Int, val launchTime: Date, - val duration: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[JLong]) + val duration: Option[Long], val executorId: String, val host: String, val status: String, diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala new file mode 100644 index 0000000000000..9579accd2cba7 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import com.fasterxml.jackson.annotation.JsonIgnore + +import org.apache.spark.status.KVUtils._ +import org.apache.spark.status.api.v1._ +import org.apache.spark.util.kvstore.KVIndex + +private[spark] class ApplicationInfoWrapper(val info: ApplicationInfo) { + + @JsonIgnore @KVIndex + def id: String = info.id + +} + +private[spark] class ExecutorSummaryWrapper(val info: ExecutorSummary) { + + @JsonIgnore @KVIndex + private[this] val id: String = info.id + + @JsonIgnore @KVIndex("active") + private[this] val active: Boolean = info.isActive + + @JsonIgnore @KVIndex("host") + val host: String = info.hostPort.split(":")(0) + +} + +/** + * Keep track of the existing stages when the job was submitted, and those that were + * completed during the job's execution. This allows a more accurate acounting of how + * many tasks were skipped for the job. + */ +private[spark] class JobDataWrapper( + val info: JobData, + val skippedStages: Set[Int]) { + + @JsonIgnore @KVIndex + private[this] val id: Int = info.jobId + +} + +private[spark] class StageDataWrapper( + val info: StageData, + val jobIds: Set[Int]) { + + @JsonIgnore @KVIndex + def id: Array[Int] = Array(info.stageId, info.attemptId) + +} + +private[spark] class TaskDataWrapper(val info: TaskData) { + + @JsonIgnore @KVIndex + def id: Long = info.taskId + +} + +private[spark] class RDDStorageInfoWrapper(val info: RDDStorageInfo) { + + @JsonIgnore @KVIndex + def id: Int = info.id + + @JsonIgnore @KVIndex("cached") + def cached: Boolean = info.numCachedPartitions > 0 + +} + +private[spark] class ExecutorStageSummaryWrapper( + val stageId: Int, + val stageAttemptId: Int, + val executorId: String, + val info: ExecutorStageSummary) { + + @JsonIgnore @KVIndex + val id: Array[Any] = Array(stageId, stageAttemptId, executorId) + + @JsonIgnore @KVIndex("stage") + private[this] val stage: Array[Int] = Array(stageId, stageAttemptId) + +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 2141934c92640..03bd3eaf579f3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -611,7 +611,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc // Manually overwrite the version in the listing db; this should cause the new provider to // discard all data because the versions don't match. - val meta = new KVStoreMetadata(FsHistoryProvider.CURRENT_LISTING_VERSION + 1, + val meta = new FsHistoryProviderMetadata(FsHistoryProvider.CURRENT_LISTING_VERSION + 1, conf.get(LOCAL_STORE_DIR).get) oldProvider.listing.setMetadata(meta) oldProvider.stop() diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala new file mode 100644 index 0000000000000..6f7a0c14dd684 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -0,0 +1,690 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import java.io.File +import java.util.{Date, Properties} + +import scala.collection.JavaConverters._ +import scala.reflect.{classTag, ClassTag} + +import org.scalatest.BeforeAndAfter + +import org.apache.spark._ +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.cluster._ +import org.apache.spark.status.api.v1 +import org.apache.spark.storage._ +import org.apache.spark.util.Utils +import org.apache.spark.util.kvstore._ + +class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { + + private var time: Long = _ + private var testDir: File = _ + private var store: KVStore = _ + + before { + time = 0L + testDir = Utils.createTempDir() + store = KVUtils.open(testDir, getClass().getName()) + } + + after { + store.close() + Utils.deleteRecursively(testDir) + } + + test("scheduler events") { + val listener = new AppStatusListener(store) + + // Start the application. + time += 1 + listener.onApplicationStart(SparkListenerApplicationStart( + "name", + Some("id"), + time, + "user", + Some("attempt"), + None)) + + check[ApplicationInfoWrapper]("id") { app => + assert(app.info.name === "name") + assert(app.info.id === "id") + assert(app.info.attempts.size === 1) + + val attempt = app.info.attempts.head + assert(attempt.attemptId === Some("attempt")) + assert(attempt.startTime === new Date(time)) + assert(attempt.lastUpdated === new Date(time)) + assert(attempt.endTime.getTime() === -1L) + assert(attempt.sparkUser === "user") + assert(!attempt.completed) + } + + // Start a couple of executors. 
+ time += 1 + val execIds = Array("1", "2") + + execIds.foreach { id => + listener.onExecutorAdded(SparkListenerExecutorAdded(time, id, + new ExecutorInfo(s"$id.example.com", 1, Map()))) + } + + execIds.foreach { id => + check[ExecutorSummaryWrapper](id) { exec => + assert(exec.info.id === id) + assert(exec.info.hostPort === s"$id.example.com") + assert(exec.info.isActive) + } + } + + // Start a job with 2 stages / 4 tasks each + time += 1 + val stages = Seq( + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), + new StageInfo(2, 0, "stage2", 4, Nil, Seq(1), "details2")) + + val jobProps = new Properties() + jobProps.setProperty(SparkContext.SPARK_JOB_GROUP_ID, "jobGroup") + jobProps.setProperty("spark.scheduler.pool", "schedPool") + + listener.onJobStart(SparkListenerJobStart(1, time, stages, jobProps)) + + check[JobDataWrapper](1) { job => + assert(job.info.jobId === 1) + assert(job.info.name === stages.last.name) + assert(job.info.description === None) + assert(job.info.status === JobExecutionStatus.RUNNING) + assert(job.info.submissionTime === Some(new Date(time))) + assert(job.info.jobGroup === Some("jobGroup")) + } + + stages.foreach { info => + check[StageDataWrapper](key(info)) { stage => + assert(stage.info.status === v1.StageStatus.PENDING) + assert(stage.jobIds === Set(1)) + } + } + + // Submit stage 1 + time += 1 + stages.head.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stages.head, jobProps)) + + check[JobDataWrapper](1) { job => + assert(job.info.numActiveStages === 1) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.status === v1.StageStatus.ACTIVE) + assert(stage.info.submissionTime === Some(new Date(stages.head.submissionTime.get))) + assert(stage.info.schedulingPool === "schedPool") + } + + // Start tasks from stage 1 + time += 1 + var _taskIdTracker = -1L + def nextTaskId(): Long = { + _taskIdTracker += 1 + _taskIdTracker + } + + def createTasks(count: Int, time: Long): Seq[TaskInfo] = { + (1 to count).map { id => + val exec = execIds(id.toInt % execIds.length) + val taskId = nextTaskId() + new TaskInfo(taskId, taskId.toInt, 1, time, exec, s"$exec.example.com", + TaskLocality.PROCESS_LOCAL, id % 2 == 0) + } + } + + val s1Tasks = createTasks(4, time) + s1Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, task)) + } + + assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size) + + check[JobDataWrapper](1) { job => + assert(job.info.numActiveTasks === s1Tasks.size) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.numActiveTasks === s1Tasks.size) + assert(stage.info.firstTaskLaunchedTime === Some(new Date(s1Tasks.head.launchTime))) + } + + s1Tasks.foreach { task => + check[TaskDataWrapper](task.taskId) { wrapper => + assert(wrapper.info.taskId === task.taskId) + assert(wrapper.info.index === task.index) + assert(wrapper.info.attempt === task.attemptNumber) + assert(wrapper.info.launchTime === new Date(task.launchTime)) + assert(wrapper.info.executorId === task.executorId) + assert(wrapper.info.host === task.host) + assert(wrapper.info.status === task.status) + assert(wrapper.info.taskLocality === task.taskLocality.toString()) + assert(wrapper.info.speculative === task.speculative) + } + } + + // Send executor metrics update. Only update one metric to avoid a lot of boilerplate code. 
+ s1Tasks.foreach { task => + val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), + Some(1L), None, true, false, None) + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( + task.executorId, + Seq((task.taskId, stages.head.stageId, stages.head.attemptId, Seq(accum))))) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.memoryBytesSpilled === s1Tasks.size) + } + + val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") + .first(key(stages.head)).last(key(stages.head)).asScala.toSeq + assert(execs.size > 0) + execs.foreach { exec => + assert(exec.info.memoryBytesSpilled === s1Tasks.size / 2) + } + + // Fail one of the tasks, re-start it. + time += 1 + s1Tasks.head.markFinished(TaskState.FAILED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + "taskType", TaskResultLost, s1Tasks.head, null)) + + time += 1 + val reattempt = { + val orig = s1Tasks.head + // Task reattempts have a different ID, but the same index as the original. + new TaskInfo(nextTaskId(), orig.index, orig.attemptNumber + 1, time, orig.executorId, + s"${orig.executorId}.example.com", TaskLocality.PROCESS_LOCAL, orig.speculative) + } + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + reattempt)) + + assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size + 1) + + check[JobDataWrapper](1) { job => + assert(job.info.numFailedTasks === 1) + assert(job.info.numActiveTasks === s1Tasks.size) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.numFailedTasks === 1) + assert(stage.info.numActiveTasks === s1Tasks.size) + } + + check[TaskDataWrapper](s1Tasks.head.taskId) { task => + assert(task.info.status === s1Tasks.head.status) + assert(task.info.duration === Some(s1Tasks.head.duration)) + assert(task.info.errorMessage == Some(TaskResultLost.toErrorString)) + } + + check[TaskDataWrapper](reattempt.taskId) { task => + assert(task.info.index === s1Tasks.head.index) + assert(task.info.attempt === reattempt.attemptNumber) + } + + // Succeed all tasks in stage 1. + val pending = s1Tasks.drop(1) ++ Seq(reattempt) + + val s1Metrics = TaskMetrics.empty + s1Metrics.setExecutorCpuTime(2L) + s1Metrics.setExecutorRunTime(4L) + + time += 1 + pending.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + "taskType", Success, task, s1Metrics)) + } + + check[JobDataWrapper](1) { job => + assert(job.info.numFailedTasks === 1) + assert(job.info.numActiveTasks === 0) + assert(job.info.numCompletedTasks === pending.size) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.numFailedTasks === 1) + assert(stage.info.numActiveTasks === 0) + assert(stage.info.numCompleteTasks === pending.size) + } + + pending.foreach { task => + check[TaskDataWrapper](task.taskId) { wrapper => + assert(wrapper.info.errorMessage === None) + assert(wrapper.info.taskMetrics.get.executorCpuTime === 2L) + assert(wrapper.info.taskMetrics.get.executorRunTime === 4L) + } + } + + assert(store.count(classOf[TaskDataWrapper]) === pending.size + 1) + + // End stage 1. 
+ time += 1 + stages.head.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stages.head)) + + check[JobDataWrapper](1) { job => + assert(job.info.numActiveStages === 0) + assert(job.info.numCompletedStages === 1) + } + + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.status === v1.StageStatus.COMPLETE) + assert(stage.info.numFailedTasks === 1) + assert(stage.info.numActiveTasks === 0) + assert(stage.info.numCompleteTasks === pending.size) + } + + // Submit stage 2. + time += 1 + stages.last.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stages.last, jobProps)) + + check[JobDataWrapper](1) { job => + assert(job.info.numActiveStages === 1) + } + + check[StageDataWrapper](key(stages.last)) { stage => + assert(stage.info.status === v1.StageStatus.ACTIVE) + assert(stage.info.submissionTime === Some(new Date(stages.last.submissionTime.get))) + } + + // Start and fail all tasks of stage 2. + time += 1 + val s2Tasks = createTasks(4, time) + s2Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, stages.last.attemptId, task)) + } + + time += 1 + s2Tasks.foreach { task => + task.markFinished(TaskState.FAILED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptId, + "taskType", TaskResultLost, task, null)) + } + + check[JobDataWrapper](1) { job => + assert(job.info.numFailedTasks === 1 + s2Tasks.size) + assert(job.info.numActiveTasks === 0) + } + + check[StageDataWrapper](key(stages.last)) { stage => + assert(stage.info.numFailedTasks === s2Tasks.size) + assert(stage.info.numActiveTasks === 0) + } + + // Fail stage 2. + time += 1 + stages.last.completionTime = Some(time) + stages.last.failureReason = Some("uh oh") + listener.onStageCompleted(SparkListenerStageCompleted(stages.last)) + + check[JobDataWrapper](1) { job => + assert(job.info.numCompletedStages === 1) + assert(job.info.numFailedStages === 1) + } + + check[StageDataWrapper](key(stages.last)) { stage => + assert(stage.info.status === v1.StageStatus.FAILED) + assert(stage.info.numFailedTasks === s2Tasks.size) + assert(stage.info.numActiveTasks === 0) + assert(stage.info.numCompleteTasks === 0) + } + + // - Re-submit stage 2, all tasks, and succeed them and the stage. 
+ val oldS2 = stages.last + val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptId + 1, oldS2.name, oldS2.numTasks, + oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) + + time += 1 + newS2.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(newS2, jobProps)) + assert(store.count(classOf[StageDataWrapper]) === 3) + + val newS2Tasks = createTasks(4, time) + + newS2Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptId, task)) + } + + time += 1 + newS2Tasks.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptId, "taskType", Success, + task, null)) + } + + time += 1 + newS2.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(newS2)) + + check[JobDataWrapper](1) { job => + assert(job.info.numActiveStages === 0) + assert(job.info.numFailedStages === 1) + assert(job.info.numCompletedStages === 2) + } + + check[StageDataWrapper](key(newS2)) { stage => + assert(stage.info.status === v1.StageStatus.COMPLETE) + assert(stage.info.numActiveTasks === 0) + assert(stage.info.numCompleteTasks === newS2Tasks.size) + } + + // End job. + time += 1 + listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) + + check[JobDataWrapper](1) { job => + assert(job.info.status === JobExecutionStatus.SUCCEEDED) + } + + // Submit a second job that re-uses stage 1 and stage 2. Stage 1 won't be re-run, but + // stage 2 will. In any case, the DAGScheduler creates new info structures that are copies + // of the old stages, so mimic that behavior here. The "new" stage 1 is submitted without + // a submission time, which means it is "skipped", and the stage 2 re-execution should not + // change the stats of the already finished job. + time += 1 + val j2Stages = Seq( + new StageInfo(3, 0, "stage1", 4, Nil, Nil, "details1"), + new StageInfo(4, 0, "stage2", 4, Nil, Seq(3), "details2")) + j2Stages.last.submissionTime = Some(time) + listener.onJobStart(SparkListenerJobStart(2, time, j2Stages, null)) + assert(store.count(classOf[JobDataWrapper]) === 2) + + listener.onStageSubmitted(SparkListenerStageSubmitted(j2Stages.head, jobProps)) + listener.onStageCompleted(SparkListenerStageCompleted(j2Stages.head)) + listener.onStageSubmitted(SparkListenerStageSubmitted(j2Stages.last, jobProps)) + assert(store.count(classOf[StageDataWrapper]) === 5) + + time += 1 + val j2s2Tasks = createTasks(4, time) + + j2s2Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, j2Stages.last.attemptId, + task)) + } + + time += 1 + j2s2Tasks.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptId, + "taskType", Success, task, null)) + } + + time += 1 + j2Stages.last.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(j2Stages.last)) + + time += 1 + listener.onJobEnd(SparkListenerJobEnd(2, time, JobSucceeded)) + + check[JobDataWrapper](1) { job => + assert(job.info.numCompletedStages === 2) + assert(job.info.numCompletedTasks === s1Tasks.size + s2Tasks.size) + } + + check[JobDataWrapper](2) { job => + assert(job.info.status === JobExecutionStatus.SUCCEEDED) + assert(job.info.numCompletedStages === 1) + assert(job.info.numCompletedTasks === j2s2Tasks.size) + assert(job.info.numSkippedStages === 1) + assert(job.info.numSkippedTasks === s1Tasks.size) + } + + // Blacklist an executor. 
+ time += 1 + listener.onExecutorBlacklisted(SparkListenerExecutorBlacklisted(time, "1", 42)) + check[ExecutorSummaryWrapper]("1") { exec => + assert(exec.info.isBlacklisted) + } + + time += 1 + listener.onExecutorUnblacklisted(SparkListenerExecutorUnblacklisted(time, "1")) + check[ExecutorSummaryWrapper]("1") { exec => + assert(!exec.info.isBlacklisted) + } + + // Blacklist a node. + time += 1 + listener.onNodeBlacklisted(SparkListenerNodeBlacklisted(time, "1.example.com", 2)) + check[ExecutorSummaryWrapper]("1") { exec => + assert(exec.info.isBlacklisted) + } + + time += 1 + listener.onNodeUnblacklisted(SparkListenerNodeUnblacklisted(time, "1.example.com")) + check[ExecutorSummaryWrapper]("1") { exec => + assert(!exec.info.isBlacklisted) + } + + // Stop executors. + listener.onExecutorRemoved(SparkListenerExecutorRemoved(41L, "1", "Test")) + listener.onExecutorRemoved(SparkListenerExecutorRemoved(41L, "2", "Test")) + + Seq("1", "2").foreach { id => + check[ExecutorSummaryWrapper](id) { exec => + assert(exec.info.id === id) + assert(!exec.info.isActive) + } + } + + // End the application. + listener.onApplicationEnd(SparkListenerApplicationEnd(42L)) + + check[ApplicationInfoWrapper]("id") { app => + assert(app.info.name === "name") + assert(app.info.id === "id") + assert(app.info.attempts.size === 1) + + val attempt = app.info.attempts.head + assert(attempt.attemptId === Some("attempt")) + assert(attempt.startTime === new Date(1L)) + assert(attempt.lastUpdated === new Date(42L)) + assert(attempt.endTime === new Date(42L)) + assert(attempt.duration === 41L) + assert(attempt.sparkUser === "user") + assert(attempt.completed) + } + } + + test("storage events") { + val listener = new AppStatusListener(store) + val maxMemory = 42L + + // Register a couple of block managers. + val bm1 = BlockManagerId("1", "1.example.com", 42) + val bm2 = BlockManagerId("2", "2.example.com", 84) + Seq(bm1, bm2).foreach { bm => + listener.onExecutorAdded(SparkListenerExecutorAdded(1L, bm.executorId, + new ExecutorInfo(bm.host, 1, Map()))) + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(1L, bm, maxMemory)) + check[ExecutorSummaryWrapper](bm.executorId) { exec => + assert(exec.info.maxMemory === maxMemory) + } + } + + val rdd1b1 = RDDBlockId(1, 1) + val level = StorageLevel.MEMORY_AND_DISK + + // Submit a stage and make sure the RDD is recorded. + val rddInfo = new RDDInfo(rdd1b1.rddId, "rdd1", 2, level, Nil) + val stage = new StageInfo(1, 0, "stage1", 4, Seq(rddInfo), Nil, "details1") + listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.name === rddInfo.name) + assert(wrapper.info.numPartitions === rddInfo.numPartitions) + assert(wrapper.info.storageLevel === rddInfo.storageLevel.description) + } + + // Add partition 1 replicated on two block managers. 
+ listener.onBlockUpdated(SparkListenerBlockUpdated(BlockUpdatedInfo(bm1, rdd1b1, level, 1L, 1L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.memoryUsed === 1L) + assert(wrapper.info.diskUsed === 1L) + + assert(wrapper.info.dataDistribution.isDefined) + assert(wrapper.info.dataDistribution.get.size === 1) + + val dist = wrapper.info.dataDistribution.get.head + assert(dist.address === bm1.hostPort) + assert(dist.memoryUsed === 1L) + assert(dist.diskUsed === 1L) + assert(dist.memoryRemaining === maxMemory - dist.memoryUsed) + + assert(wrapper.info.partitions.isDefined) + assert(wrapper.info.partitions.get.size === 1) + + val part = wrapper.info.partitions.get.head + assert(part.blockName === rdd1b1.name) + assert(part.storageLevel === level.description) + assert(part.memoryUsed === 1L) + assert(part.diskUsed === 1L) + assert(part.executors === Seq(bm1.executorId)) + } + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 1L) + assert(exec.info.memoryUsed === 1L) + assert(exec.info.diskUsed === 1L) + } + + listener.onBlockUpdated(SparkListenerBlockUpdated(BlockUpdatedInfo(bm2, rdd1b1, level, 1L, 1L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.memoryUsed === 2L) + assert(wrapper.info.diskUsed === 2L) + assert(wrapper.info.dataDistribution.get.size === 2L) + assert(wrapper.info.partitions.get.size === 1L) + + val dist = wrapper.info.dataDistribution.get.find(_.address == bm2.hostPort).get + assert(dist.memoryUsed === 1L) + assert(dist.diskUsed === 1L) + assert(dist.memoryRemaining === maxMemory - dist.memoryUsed) + + val part = wrapper.info.partitions.get(0) + assert(part.memoryUsed === 2L) + assert(part.diskUsed === 2L) + assert(part.executors === Seq(bm1.executorId, bm2.executorId)) + } + + check[ExecutorSummaryWrapper](bm2.executorId) { exec => + assert(exec.info.rddBlocks === 1L) + assert(exec.info.memoryUsed === 1L) + assert(exec.info.diskUsed === 1L) + } + + // Add a second partition only to bm 1. + val rdd1b2 = RDDBlockId(1, 2) + listener.onBlockUpdated(SparkListenerBlockUpdated(BlockUpdatedInfo(bm1, rdd1b2, level, + 3L, 3L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.memoryUsed === 5L) + assert(wrapper.info.diskUsed === 5L) + assert(wrapper.info.dataDistribution.get.size === 2L) + assert(wrapper.info.partitions.get.size === 2L) + + val dist = wrapper.info.dataDistribution.get.find(_.address == bm1.hostPort).get + assert(dist.memoryUsed === 4L) + assert(dist.diskUsed === 4L) + assert(dist.memoryRemaining === maxMemory - dist.memoryUsed) + + val part = wrapper.info.partitions.get.find(_.blockName === rdd1b2.name).get + assert(part.storageLevel === level.description) + assert(part.memoryUsed === 3L) + assert(part.diskUsed === 3L) + assert(part.executors === Seq(bm1.executorId)) + } + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 2L) + assert(exec.info.memoryUsed === 4L) + assert(exec.info.diskUsed === 4L) + } + + // Remove block 1 from bm 1. 
+ listener.onBlockUpdated(SparkListenerBlockUpdated(BlockUpdatedInfo(bm1, rdd1b1, + StorageLevel.NONE, 1L, 1L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.memoryUsed === 4L) + assert(wrapper.info.diskUsed === 4L) + assert(wrapper.info.dataDistribution.get.size === 2L) + assert(wrapper.info.partitions.get.size === 2L) + + val dist = wrapper.info.dataDistribution.get.find(_.address == bm1.hostPort).get + assert(dist.memoryUsed === 3L) + assert(dist.diskUsed === 3L) + assert(dist.memoryRemaining === maxMemory - dist.memoryUsed) + + val part = wrapper.info.partitions.get.find(_.blockName === rdd1b1.name).get + assert(part.storageLevel === level.description) + assert(part.memoryUsed === 1L) + assert(part.diskUsed === 1L) + assert(part.executors === Seq(bm2.executorId)) + } + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 1L) + assert(exec.info.memoryUsed === 3L) + assert(exec.info.diskUsed === 3L) + } + + // Remove block 2 from bm 2. This should leave only block 2 info in the store. + listener.onBlockUpdated(SparkListenerBlockUpdated(BlockUpdatedInfo(bm2, rdd1b1, + StorageLevel.NONE, 1L, 1L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.memoryUsed === 3L) + assert(wrapper.info.diskUsed === 3L) + assert(wrapper.info.dataDistribution.get.size === 1L) + assert(wrapper.info.partitions.get.size === 1L) + assert(wrapper.info.partitions.get(0).blockName === rdd1b2.name) + } + + check[ExecutorSummaryWrapper](bm2.executorId) { exec => + assert(exec.info.rddBlocks === 0L) + assert(exec.info.memoryUsed === 0L) + assert(exec.info.diskUsed === 0L) + } + + // Unpersist RDD1. + listener.onUnpersistRDD(SparkListenerUnpersistRDD(rdd1b1.rddId)) + intercept[NoSuchElementException] { + check[RDDStorageInfoWrapper](rdd1b1.rddId) { _ => () } + } + + } + + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) + + private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { + val value = store.read(classTag[T].runtimeClass, key).asInstanceOf[T] + fn(value) + } + +} diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index dd299e074535e..45b8870f3b62f 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,8 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // SPARK-18085: Better History Server scalability for many / large applications + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.status.api.v1.ExecutorSummary.executorLogs"), // [SPARK-20495][SQL] Add StorageLevel to cacheTable API ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.cacheTable"), From 4f8dc6b01ea787243a38678ea8199fbb0814cffc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 26 Oct 2017 21:41:45 +0100 Subject: [PATCH 770/779] [SPARK-22328][CORE] ClosureCleaner should not miss referenced superclass fields ## What changes were proposed in this pull request? When the given closure uses some fields defined in super class, `ClosureCleaner` can't figure them and don't set it properly. Those fields will be in null values. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #19556 from viirya/SPARK-22328. 
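For illustration only (not part of the patch): a minimal, self-contained sketch of the closure pattern the new tests in this change exercise, where a function defined in a subclass reads `val`s declared in its serializable superclass — the fields the cleaner used to miss, per the description above. Class and object names are made up for the example.

```scala
// Illustrative sketch for the SPARK-22328 scenario (names are placeholders).
// The lambda returned by `data` closes over n1/s1, which are declared in the
// superclass Base rather than in Concrete itself; these superclass fields are
// the ones the commit message says were not being populated before the fix.
import org.apache.spark.SparkContext

abstract class Base extends Serializable {
  val n1 = 111
  val s1 = "aaa"
}

class Concrete extends Base {
  val n2 = 222
  def data: Int => (Int, Int, String) = _ => (n1, n2, s1)
}

object Spark22328Sketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "SPARK-22328-sketch")
    try {
      val out = sc.parallelize(1 to 1).map(new Concrete().data).collect()
      // With the fix this yields (111,222,aaa); per the description above,
      // the superclass fields could previously come back as 0 / null.
      out.foreach(println)
    } finally {
      sc.stop()
    }
  }
}
```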
--- .../apache/spark/util/ClosureCleaner.scala | 73 ++++++++++++++++--- .../spark/util/ClosureCleanerSuite.scala | 72 ++++++++++++++++++ 2 files changed, 133 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index 48a1d7b84b61b..dfece5dd0670b 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -91,6 +91,54 @@ private[spark] object ClosureCleaner extends Logging { (seen - obj.getClass).toList } + /** Initializes the accessed fields for outer classes and their super classes. */ + private def initAccessedFields( + accessedFields: Map[Class[_], Set[String]], + outerClasses: Seq[Class[_]]): Unit = { + for (cls <- outerClasses) { + var currentClass = cls + assert(currentClass != null, "The outer class can't be null.") + + while (currentClass != null) { + accessedFields(currentClass) = Set.empty[String] + currentClass = currentClass.getSuperclass() + } + } + } + + /** Sets accessed fields for given class in clone object based on given object. */ + private def setAccessedFields( + outerClass: Class[_], + clone: AnyRef, + obj: AnyRef, + accessedFields: Map[Class[_], Set[String]]): Unit = { + for (fieldName <- accessedFields(outerClass)) { + val field = outerClass.getDeclaredField(fieldName) + field.setAccessible(true) + val value = field.get(obj) + field.set(clone, value) + } + } + + /** Clones a given object and sets accessed fields in cloned object. */ + private def cloneAndSetFields( + parent: AnyRef, + obj: AnyRef, + outerClass: Class[_], + accessedFields: Map[Class[_], Set[String]]): AnyRef = { + val clone = instantiateClass(outerClass, parent) + + var currentClass = outerClass + assert(currentClass != null, "The outer class can't be null.") + + while (currentClass != null) { + setAccessedFields(currentClass, clone, obj, accessedFields) + currentClass = currentClass.getSuperclass() + } + + clone + } + /** * Clean the given closure in place. * @@ -202,9 +250,8 @@ private[spark] object ClosureCleaner extends Logging { logDebug(s" + populating accessed fields because this is the starting closure") // Initialize accessed fields with the outer classes first // This step is needed to associate the fields to the correct classes later - for (cls <- outerClasses) { - accessedFields(cls) = Set.empty[String] - } + initAccessedFields(accessedFields, outerClasses) + // Populate accessed fields by visiting all fields and methods accessed by this and // all of its inner closures. If transitive cleaning is enabled, this may recursively // visits methods that belong to other classes in search of transitively referenced fields. @@ -250,13 +297,8 @@ private[spark] object ClosureCleaner extends Logging { // required fields from the original object. We need the parent here because the Java // language specification requires the first constructor parameter of any closure to be // its enclosing object. 
- val clone = instantiateClass(cls, parent) - for (fieldName <- accessedFields(cls)) { - val field = cls.getDeclaredField(fieldName) - field.setAccessible(true) - val value = field.get(obj) - field.set(clone, value) - } + val clone = cloneAndSetFields(parent, obj, cls, accessedFields) + // If transitive cleaning is enabled, we recursively clean any enclosing closure using // the already populated accessed fields map of the starting closure if (cleanTransitively && isClosure(clone.getClass)) { @@ -395,8 +437,15 @@ private[util] class FieldAccessFinder( if (!visitedMethods.contains(m)) { // Keep track of visited methods to avoid potential infinite cycles visitedMethods += m - ClosureCleaner.getClassReader(cl).accept( - new FieldAccessFinder(fields, findTransitively, Some(m), visitedMethods), 0) + + var currentClass = cl + assert(currentClass != null, "The outer class can't be null.") + + while (currentClass != null) { + ClosureCleaner.getClassReader(currentClass).accept( + new FieldAccessFinder(fields, findTransitively, Some(m), visitedMethods), 0) + currentClass = currentClass.getSuperclass() + } } } } diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala index 4920b7ee8bfb4..9a19baee9569e 100644 --- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala @@ -119,6 +119,63 @@ class ClosureCleanerSuite extends SparkFunSuite { test("createNullValue") { new TestCreateNullValue().run() } + + test("SPARK-22328: ClosureCleaner misses referenced superclass fields: case 1") { + val concreteObject = new TestAbstractClass { + val n2 = 222 + val s2 = "bbb" + val d2 = 2.0d + + def run(): Seq[(Int, Int, String, String, Double, Double)] = { + withSpark(new SparkContext("local", "test")) { sc => + val rdd = sc.parallelize(1 to 1) + body(rdd) + } + } + + def body(rdd: RDD[Int]): Seq[(Int, Int, String, String, Double, Double)] = rdd.map { _ => + (n1, n2, s1, s2, d1, d2) + }.collect() + } + assert(concreteObject.run() === Seq((111, 222, "aaa", "bbb", 1.0d, 2.0d))) + } + + test("SPARK-22328: ClosureCleaner misses referenced superclass fields: case 2") { + val concreteObject = new TestAbstractClass2 { + val n2 = 222 + val s2 = "bbb" + val d2 = 2.0d + def getData: Int => (Int, Int, String, String, Double, Double) = _ => (n1, n2, s1, s2, d1, d2) + } + withSpark(new SparkContext("local", "test")) { sc => + val rdd = sc.parallelize(1 to 1).map(concreteObject.getData) + assert(rdd.collect() === Seq((111, 222, "aaa", "bbb", 1.0d, 2.0d))) + } + } + + test("SPARK-22328: multiple outer classes have the same parent class") { + val concreteObject = new TestAbstractClass2 { + + val innerObject = new TestAbstractClass2 { + override val n1 = 222 + override val s1 = "bbb" + } + + val innerObject2 = new TestAbstractClass2 { + override val n1 = 444 + val n3 = 333 + val s3 = "ccc" + val d3 = 3.0d + + def getData: Int => (Int, Int, String, String, Double, Double, Int, String) = + _ => (n1, n3, s1, s3, d1, d3, innerObject.n1, innerObject.s1) + } + } + withSpark(new SparkContext("local", "test")) { sc => + val rdd = sc.parallelize(1 to 1).map(concreteObject.innerObject2.getData) + assert(rdd.collect() === Seq((444, 333, "aaa", "ccc", 1.0d, 3.0d, 222, "bbb"))) + } + } } // A non-serializable class we create in closures to make sure that we aren't @@ -377,3 +434,18 @@ class TestCreateNullValue { nestedClosure() } } + +abstract class TestAbstractClass 
extends Serializable { + val n1 = 111 + val s1 = "aaa" + protected val d1 = 1.0d + + def run(): Seq[(Int, Int, String, String, Double, Double)] + def body(rdd: RDD[Int]): Seq[(Int, Int, String, String, Double, Double)] +} + +abstract class TestAbstractClass2 extends Serializable { + val n1 = 111 + val s1 = "aaa" + protected val d1 = 1.0d +} From 5415963d2caaf95604211419ffc4e29fff38e1d7 Mon Sep 17 00:00:00 2001 From: "Susan X. Huynh" Date: Thu, 26 Oct 2017 16:13:48 -0700 Subject: [PATCH 771/779] [SPARK-22131][MESOS] Mesos driver secrets ## Background In #18837 , ArtRand added Mesos secrets support to the dispatcher. **This PR is to add the same secrets support to the drivers.** This means if the secret configs are set, the driver will launch executors that have access to either env or file-based secrets. One use case for this is to support TLS in the driver <=> executor communication. ## What changes were proposed in this pull request? Most of the changes are a refactor of the dispatcher secrets support (#18837) - moving it to a common place that can be used by both the dispatcher and drivers. The same goes for the unit tests. ## How was this patch tested? There are four config combinations: [env or file-based] x [value or reference secret]. For each combination: - Added a unit test. - Tested in DC/OS. Author: Susan X. Huynh Closes #19437 from susanxhuynh/sh-mesos-driver-secret. --- docs/running-on-mesos.md | 111 ++++++++++--- .../apache/spark/deploy/mesos/config.scala | 64 ++++---- .../cluster/mesos/MesosClusterScheduler.scala | 138 +++------------- .../MesosCoarseGrainedSchedulerBackend.scala | 31 +++- .../MesosFineGrainedSchedulerBackend.scala | 4 +- .../mesos/MesosSchedulerBackendUtil.scala | 92 ++++++++++- .../mesos/MesosClusterSchedulerSuite.scala | 150 +++--------------- ...osCoarseGrainedSchedulerBackendSuite.scala | 34 +++- .../MesosSchedulerBackendUtilSuite.scala | 7 +- .../spark/scheduler/cluster/mesos/Utils.scala | 107 +++++++++++++ 10 files changed, 434 insertions(+), 304 deletions(-) diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index e0944bc9f5f86..b7e3e6473c338 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -485,39 +485,106 @@ See the [configuration page](configuration.html) for information on Spark config - spark.mesos.driver.secret.envkeys - (none) - A comma-separated list that, if set, the contents of the secret referenced - by spark.mesos.driver.secret.names or spark.mesos.driver.secret.values will be - set to the provided environment variable in the driver's process. + spark.mesos.driver.secret.values, + spark.mesos.driver.secret.names, + spark.mesos.executor.secret.values, + spark.mesos.executor.secret.names, - - -spark.mesos.driver.secret.filenames (none) - A comma-separated list that, if set, the contents of the secret referenced by - spark.mesos.driver.secret.names or spark.mesos.driver.secret.values will be - written to the provided file. Paths are relative to the container's work - directory. Absolute paths must already exist. Consult the Mesos Secret - protobuf for more information. +

    + A secret is specified by its contents and destination. These properties + specify a secret's contents. To specify a secret's destination, see the cell below. +

    +

    + You can specify a secret's contents either (1) by value or (2) by reference. +

    +

    + (1) To specify a secret by value, set the + spark.mesos.[driver|executor].secret.values + property, to make the secret available in the driver or executors. + For example, to make a secret password "guessme" available to the driver process, set: + +

    spark.mesos.driver.secret.values=guessme
    +

    +

    + (2) To specify a secret that has been placed in a secret store + by reference, specify its name within the secret store + by setting the spark.mesos.[driver|executor].secret.names + property. For example, to make a secret password named "password" in a secret store + available to the driver process, set: + +

    spark.mesos.driver.secret.names=password
    +

    +

    + Note: To use a secret store, make sure one has been integrated with Mesos via a custom + SecretResolver + module. +

    +

    + To specify multiple secrets, provide a comma-separated list: + +

    spark.mesos.driver.secret.values=guessme,passwd123
    + + or + +
    spark.mesos.driver.secret.names=password1,password2
    +

    + - spark.mesos.driver.secret.names - (none) - A comma-separated list of secret references. Consult the Mesos Secret - protobuf for more information. + spark.mesos.driver.secret.envkeys, + spark.mesos.driver.secret.filenames, + spark.mesos.executor.secret.envkeys, + spark.mesos.executor.secret.filenames, - - - spark.mesos.driver.secret.values (none) - A comma-separated list of secret values. Consult the Mesos Secret - protobuf for more information. +

    + A secret is specified by its contents and destination. These properties + specify a secret's destination. To specify a secret's contents, see the cell above. +

    +

    + You can specify a secret's destination in the driver or + executors as either (1) an environment variable or (2) as a file. +

    +

    + (1) To make an environment-based secret, set the + spark.mesos.[driver|executor].secret.envkeys property. + The secret will appear as an environment variable with the + given name in the driver or executors. For example, to make a secret password available + to the driver process as $PASSWORD, set: + +

    spark.mesos.driver.secret.envkeys=PASSWORD
    +

    +

    + (2) To make a file-based secret, set the + spark.mesos.[driver|executor].secret.filenames property. + The secret will appear in the contents of a file with the given file name in + the driver or executors. For example, to make a secret password available in a + file named "pwdfile" in the driver process, set: + +

    spark.mesos.driver.secret.filenames=pwdfile
    +

    +

    + Paths are relative to the container's work directory. Absolute paths must + already exist. Note: File-based secrets require a custom + SecretResolver + module. +

    +

    + To specify env vars or file names corresponding to multiple secrets, + provide a comma-separated list: + +

    spark.mesos.driver.secret.envkeys=PASSWORD1,PASSWORD2
    + + or + +
    spark.mesos.driver.secret.filenames=pwdfile1,pwdfile2
    +

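For illustration only (not part of the patch): a minimal sketch of how the secret properties documented in the table above might be wired up programmatically on a `SparkConf`. In practice they are usually passed with `--conf` on `spark-submit`; the secret name `password`, env key `PASSWORD`, and file name `pwdfile` are placeholders, and a reference-type secret assumes a `SecretResolver` module is integrated with Mesos.

```scala
// Illustrative sketch only: setting the driver secret properties from the
// table above. Values shown here are placeholders, not required names.
import org.apache.spark.SparkConf

object MesosSecretConfSketch {
  def build(): SparkConf = {
    new SparkConf()
      .setAppName("mesos-secrets-sketch")
      // Contents: reference a secret named "password" in the cluster's secret store.
      .set("spark.mesos.driver.secret.names", "password")
      // Destination: expose it to the driver process as the env var $PASSWORD.
      .set("spark.mesos.driver.secret.envkeys", "PASSWORD")
      // File-based alternative (path is relative to the container work directory):
      // .set("spark.mesos.driver.secret.filenames", "pwdfile")
  }
}
```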
    diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala index 7e85de91c5d36..821534eb4fc38 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala @@ -23,6 +23,39 @@ import org.apache.spark.internal.config.ConfigBuilder package object config { + private[spark] class MesosSecretConfig private[config](taskType: String) { + private[spark] val SECRET_NAMES = + ConfigBuilder(s"spark.mesos.$taskType.secret.names") + .doc("A comma-separated list of secret reference names. Consult the Mesos Secret " + + "protobuf for more information.") + .stringConf + .toSequence + .createOptional + + private[spark] val SECRET_VALUES = + ConfigBuilder(s"spark.mesos.$taskType.secret.values") + .doc("A comma-separated list of secret values.") + .stringConf + .toSequence + .createOptional + + private[spark] val SECRET_ENVKEYS = + ConfigBuilder(s"spark.mesos.$taskType.secret.envkeys") + .doc("A comma-separated list of the environment variables to contain the secrets." + + "The environment variable will be set on the driver.") + .stringConf + .toSequence + .createOptional + + private[spark] val SECRET_FILENAMES = + ConfigBuilder(s"spark.mesos.$taskType.secret.filenames") + .doc("A comma-separated list of file paths secret will be written to. Consult the Mesos " + + "Secret protobuf for more information.") + .stringConf + .toSequence + .createOptional + } + /* Common app configuration. */ private[spark] val SHUFFLE_CLEANER_INTERVAL_S = @@ -64,36 +97,9 @@ package object config { .stringConf .createOptional - private[spark] val SECRET_NAME = - ConfigBuilder("spark.mesos.driver.secret.names") - .doc("A comma-separated list of secret reference names. Consult the Mesos Secret protobuf " + - "for more information.") - .stringConf - .toSequence - .createOptional - - private[spark] val SECRET_VALUE = - ConfigBuilder("spark.mesos.driver.secret.values") - .doc("A comma-separated list of secret values.") - .stringConf - .toSequence - .createOptional + private[spark] val driverSecretConfig = new MesosSecretConfig("driver") - private[spark] val SECRET_ENVKEY = - ConfigBuilder("spark.mesos.driver.secret.envkeys") - .doc("A comma-separated list of the environment variables to contain the secrets." + - "The environment variable will be set on the driver.") - .stringConf - .toSequence - .createOptional - - private[spark] val SECRET_FILENAME = - ConfigBuilder("spark.mesos.driver.secret.filenames") - .doc("A comma-seperated list of file paths secret will be written to. 
Consult the Mesos " + - "Secret protobuf for more information.") - .stringConf - .toSequence - .createOptional + private[spark] val executorSecretConfig = new MesosSecretConfig("executor") private[spark] val DRIVER_FAILOVER_TIMEOUT = ConfigBuilder("spark.mesos.driver.failoverTimeout") diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index ec533f91474f2..82470264f2a4a 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -28,7 +28,6 @@ import org.apache.mesos.{Scheduler, SchedulerDriver} import org.apache.mesos.Protos.{TaskState => MesosTaskState, _} import org.apache.mesos.Protos.Environment.Variable import org.apache.mesos.Protos.TaskStatus.Reason -import org.apache.mesos.protobuf.ByteString import org.apache.spark.{SecurityManager, SparkConf, SparkException, TaskState} import org.apache.spark.deploy.mesos.MesosDriverDescription @@ -394,39 +393,20 @@ private[spark] class MesosClusterScheduler( } // add secret environment variables - getSecretEnvVar(desc).foreach { variable => - if (variable.getSecret.getReference.isInitialized) { - logInfo(s"Setting reference secret ${variable.getSecret.getReference.getName}" + - s"on file ${variable.getName}") - } else { - logInfo(s"Setting secret on environment variable name=${variable.getName}") - } - envBuilder.addVariables(variable) + MesosSchedulerBackendUtil.getSecretEnvVar(desc.conf, config.driverSecretConfig) + .foreach { variable => + if (variable.getSecret.getReference.isInitialized) { + logInfo(s"Setting reference secret ${variable.getSecret.getReference.getName} " + + s"on file ${variable.getName}") + } else { + logInfo(s"Setting secret on environment variable name=${variable.getName}") + } + envBuilder.addVariables(variable) } envBuilder.build() } - private def getSecretEnvVar(desc: MesosDriverDescription): List[Variable] = { - val secrets = getSecrets(desc) - val secretEnvKeys = desc.conf.get(config.SECRET_ENVKEY).getOrElse(Nil) - if (illegalSecretInput(secretEnvKeys, secrets)) { - throw new SparkException( - s"Need to give equal numbers of secrets and environment keys " + - s"for environment-based reference secrets got secrets $secrets, " + - s"and keys $secretEnvKeys") - } - - secrets.zip(secretEnvKeys).map { - case (s, k) => - Variable.newBuilder() - .setName(k) - .setType(Variable.Type.SECRET) - .setSecret(s) - .build - }.toList - } - private def getDriverUris(desc: MesosDriverDescription): List[CommandInfo.URI] = { val confUris = List(conf.getOption("spark.mesos.uris"), desc.conf.getOption("spark.mesos.uris"), @@ -440,6 +420,23 @@ private[spark] class MesosClusterScheduler( CommandInfo.URI.newBuilder().setValue(uri.trim()).setCache(useFetchCache).build()) } + private def getContainerInfo(desc: MesosDriverDescription): ContainerInfo.Builder = { + val containerInfo = MesosSchedulerBackendUtil.buildContainerInfo(desc.conf) + + MesosSchedulerBackendUtil.getSecretVolume(desc.conf, config.driverSecretConfig) + .foreach { volume => + if (volume.getSource.getSecret.getReference.isInitialized) { + logInfo(s"Setting reference secret ${volume.getSource.getSecret.getReference.getName} " + + s"on file ${volume.getContainerPath}") + } else { + logInfo(s"Setting secret on file 
name=${volume.getContainerPath}") + } + containerInfo.addVolumes(volume) + } + + containerInfo + } + private def getDriverCommandValue(desc: MesosDriverDescription): String = { val dockerDefined = desc.conf.contains("spark.mesos.executor.docker.image") val executorUri = getDriverExecutorURI(desc) @@ -579,89 +576,6 @@ private[spark] class MesosClusterScheduler( .build } - private def getContainerInfo(desc: MesosDriverDescription): ContainerInfo.Builder = { - val containerInfo = MesosSchedulerBackendUtil.containerInfo(desc.conf) - - getSecretVolume(desc).foreach { volume => - if (volume.getSource.getSecret.getReference.isInitialized) { - logInfo(s"Setting reference secret ${volume.getSource.getSecret.getReference.getName}" + - s"on file ${volume.getContainerPath}") - } else { - logInfo(s"Setting secret on file name=${volume.getContainerPath}") - } - containerInfo.addVolumes(volume) - } - - containerInfo - } - - - private def getSecrets(desc: MesosDriverDescription): Seq[Secret] = { - def createValueSecret(data: String): Secret = { - Secret.newBuilder() - .setType(Secret.Type.VALUE) - .setValue(Secret.Value.newBuilder().setData(ByteString.copyFrom(data.getBytes))) - .build() - } - - def createReferenceSecret(name: String): Secret = { - Secret.newBuilder() - .setReference(Secret.Reference.newBuilder().setName(name)) - .setType(Secret.Type.REFERENCE) - .build() - } - - val referenceSecrets: Seq[Secret] = - desc.conf.get(config.SECRET_NAME).getOrElse(Nil).map(s => createReferenceSecret(s)) - - val valueSecrets: Seq[Secret] = { - desc.conf.get(config.SECRET_VALUE).getOrElse(Nil).map(s => createValueSecret(s)) - } - - if (valueSecrets.nonEmpty && referenceSecrets.nonEmpty) { - throw new SparkException("Cannot specify VALUE type secrets and REFERENCE types ones") - } - - if (referenceSecrets.nonEmpty) referenceSecrets else valueSecrets - } - - private def illegalSecretInput(dest: Seq[String], s: Seq[Secret]): Boolean = { - if (dest.isEmpty) { // no destination set (ie not using secrets of this type - return false - } - if (dest.nonEmpty && s.nonEmpty) { - // make sure there is a destination for each secret of this type - if (dest.length != s.length) { - return true - } - } - false - } - - private def getSecretVolume(desc: MesosDriverDescription): List[Volume] = { - val secrets = getSecrets(desc) - val secretPaths: Seq[String] = - desc.conf.get(config.SECRET_FILENAME).getOrElse(Nil) - - if (illegalSecretInput(secretPaths, secrets)) { - throw new SparkException( - s"Need to give equal numbers of secrets and file paths for file-based " + - s"reference secrets got secrets $secrets, and paths $secretPaths") - } - - secrets.zip(secretPaths).map { - case (s, p) => - val source = Volume.Source.newBuilder() - .setType(Volume.Source.Type.SECRET) - .setSecret(s) - Volume.newBuilder() - .setContainerPath(p) - .setSource(source) - .setMode(Volume.Mode.RO) - .build - }.toList - } - /** * This method takes all the possible candidates and attempt to schedule them with Mesos offers. 
* Every time a new task is scheduled, the afterLaunchCallback is called to perform post scheduled diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 603c980cb268d..104ed01d293ce 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -28,7 +28,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.concurrent.Future -import org.apache.spark.{SecurityManager, SparkContext, SparkException, TaskState} +import org.apache.spark.{SecurityManager, SparkConf, SparkContext, SparkException, TaskState} import org.apache.spark.deploy.mesos.config._ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.config @@ -244,6 +244,17 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( .setValue(value) .build()) } + + MesosSchedulerBackendUtil.getSecretEnvVar(conf, executorSecretConfig).foreach { variable => + if (variable.getSecret.getReference.isInitialized) { + logInfo(s"Setting reference secret ${variable.getSecret.getReference.getName} " + + s"on file ${variable.getName}") + } else { + logInfo(s"Setting secret on environment variable name=${variable.getName}") + } + environment.addVariables(variable) + } + val command = CommandInfo.newBuilder() .setEnvironment(environment) @@ -424,6 +435,22 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } } + private def getContainerInfo(conf: SparkConf): ContainerInfo.Builder = { + val containerInfo = MesosSchedulerBackendUtil.buildContainerInfo(conf) + + MesosSchedulerBackendUtil.getSecretVolume(conf, executorSecretConfig).foreach { volume => + if (volume.getSource.getSecret.getReference.isInitialized) { + logInfo(s"Setting reference secret ${volume.getSource.getSecret.getReference.getName} " + + s"on file ${volume.getContainerPath}") + } else { + logInfo(s"Setting secret on file name=${volume.getContainerPath}") + } + containerInfo.addVolumes(volume) + } + + containerInfo + } + /** * Returns a map from OfferIDs to the tasks to launch on those offers. In order to maximize * per-task memory and IO, tasks are round-robin assigned to offers. 
@@ -475,7 +502,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( .setName(s"${sc.appName} $taskId") .setLabels(MesosProtoUtils.mesosLabels(taskLabels)) .addAllResources(resourcesToUse.asJava) - .setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf)) + .setContainer(getContainerInfo(sc.conf)) tasks(offer.getId) ::= taskBuilder.build() remainingResources(offerId) = resourcesLeft.asJava diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala index 66b8e0a640121..d6d939d246109 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala @@ -28,6 +28,7 @@ import org.apache.mesos.SchedulerDriver import org.apache.mesos.protobuf.ByteString import org.apache.spark.{SparkContext, SparkException, TaskState} +import org.apache.spark.deploy.mesos.config import org.apache.spark.executor.MesosExecutorBackend import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -159,7 +160,8 @@ private[spark] class MesosFineGrainedSchedulerBackend( .setCommand(command) .setData(ByteString.copyFrom(createExecArg())) - executorInfo.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf)) + executorInfo.setContainer( + MesosSchedulerBackendUtil.buildContainerInfo(sc.conf)) (executorInfo.build(), resourcesAfterMem.asJava) } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala index f29e541addf23..bfb73611f0530 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala @@ -17,11 +17,15 @@ package org.apache.spark.scheduler.cluster.mesos -import org.apache.mesos.Protos.{ContainerInfo, Image, NetworkInfo, Parameter, Volume} +import org.apache.mesos.Protos.{ContainerInfo, Environment, Image, NetworkInfo, Parameter, Secret, Volume} import org.apache.mesos.Protos.ContainerInfo.{DockerInfo, MesosInfo} +import org.apache.mesos.Protos.Environment.Variable +import org.apache.mesos.protobuf.ByteString -import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.SparkConf +import org.apache.spark.SparkException import org.apache.spark.deploy.mesos.config.{NETWORK_LABELS, NETWORK_NAME} +import org.apache.spark.deploy.mesos.config.MesosSecretConfig import org.apache.spark.internal.Logging /** @@ -122,7 +126,7 @@ private[mesos] object MesosSchedulerBackendUtil extends Logging { .toList } - def containerInfo(conf: SparkConf): ContainerInfo.Builder = { + def buildContainerInfo(conf: SparkConf): ContainerInfo.Builder = { val containerType = if (conf.contains("spark.mesos.executor.docker.image") && conf.get("spark.mesos.containerizer", "docker") == "docker") { ContainerInfo.Type.DOCKER @@ -173,6 +177,88 @@ private[mesos] object MesosSchedulerBackendUtil extends Logging { containerInfo } + private def getSecrets(conf: SparkConf, secretConfig: MesosSecretConfig): Seq[Secret] = { + def createValueSecret(data: 
String): Secret = { + Secret.newBuilder() + .setType(Secret.Type.VALUE) + .setValue(Secret.Value.newBuilder().setData(ByteString.copyFrom(data.getBytes))) + .build() + } + + def createReferenceSecret(name: String): Secret = { + Secret.newBuilder() + .setReference(Secret.Reference.newBuilder().setName(name)) + .setType(Secret.Type.REFERENCE) + .build() + } + + val referenceSecrets: Seq[Secret] = + conf.get(secretConfig.SECRET_NAMES).getOrElse(Nil).map { s => createReferenceSecret(s) } + + val valueSecrets: Seq[Secret] = { + conf.get(secretConfig.SECRET_VALUES).getOrElse(Nil).map { s => createValueSecret(s) } + } + + if (valueSecrets.nonEmpty && referenceSecrets.nonEmpty) { + throw new SparkException("Cannot specify both value-type and reference-type secrets.") + } + + if (referenceSecrets.nonEmpty) referenceSecrets else valueSecrets + } + + private def illegalSecretInput(dest: Seq[String], secrets: Seq[Secret]): Boolean = { + if (dest.nonEmpty) { + // make sure there is a one-to-one correspondence between destinations and secrets + if (dest.length != secrets.length) { + return true + } + } + false + } + + def getSecretVolume(conf: SparkConf, secretConfig: MesosSecretConfig): List[Volume] = { + val secrets = getSecrets(conf, secretConfig) + val secretPaths: Seq[String] = + conf.get(secretConfig.SECRET_FILENAMES).getOrElse(Nil) + + if (illegalSecretInput(secretPaths, secrets)) { + throw new SparkException( + s"Need to give equal numbers of secrets and file paths for file-based " + + s"reference secrets got secrets $secrets, and paths $secretPaths") + } + + secrets.zip(secretPaths).map { case (s, p) => + val source = Volume.Source.newBuilder() + .setType(Volume.Source.Type.SECRET) + .setSecret(s) + Volume.newBuilder() + .setContainerPath(p) + .setSource(source) + .setMode(Volume.Mode.RO) + .build + }.toList + } + + def getSecretEnvVar(conf: SparkConf, secretConfig: MesosSecretConfig): + List[Variable] = { + val secrets = getSecrets(conf, secretConfig) + val secretEnvKeys = conf.get(secretConfig.SECRET_ENVKEYS).getOrElse(Nil) + if (illegalSecretInput(secretEnvKeys, secrets)) { + throw new SparkException( + s"Need to give equal numbers of secrets and environment keys " + + s"for environment-based reference secrets got secrets $secrets, " + + s"and keys $secretEnvKeys") + } + + secrets.zip(secretEnvKeys).map { case (s, k) => + Variable.newBuilder() + .setName(k) + .setType(Variable.Type.SECRET) + .setSecret(s) + .build + }.toList + } + private def dockerInfo( image: String, forcePullImage: Boolean, diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala index ff63e3f4ccfc3..77acee608f25f 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala @@ -24,7 +24,6 @@ import scala.collection.JavaConverters._ import org.apache.mesos.Protos.{Environment, Secret, TaskState => MesosTaskState, _} import org.apache.mesos.Protos.Value.{Scalar, Type} import org.apache.mesos.SchedulerDriver -import org.apache.mesos.protobuf.ByteString import org.mockito.{ArgumentCaptor, Matchers} import org.mockito.Mockito._ import org.scalatest.mockito.MockitoSugar @@ -32,6 +31,7 @@ import org.scalatest.mockito.MockitoSugar import 
org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.deploy.Command import org.apache.spark.deploy.mesos.MesosDriverDescription +import org.apache.spark.deploy.mesos.config class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar { @@ -341,132 +341,33 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi } test("Creates an env-based reference secrets.") { - setScheduler() - - val mem = 1000 - val cpu = 1 - val secretName = "/path/to/secret,/anothersecret" - val envKey = "SECRET_ENV_KEY,PASSWORD" - val driverDesc = new MesosDriverDescription( - "d1", - "jar", - mem, - cpu, - true, - command, - Map("spark.mesos.executor.home" -> "test", - "spark.app.name" -> "test", - "spark.mesos.driver.secret.names" -> secretName, - "spark.mesos.driver.secret.envkeys" -> envKey), - "s1", - new Date()) - val response = scheduler.submitDriver(driverDesc) - assert(response.success) - val offer = Utils.createOffer("o1", "s1", mem, cpu) - scheduler.resourceOffers(driver, Collections.singletonList(offer)) - val launchedTasks = Utils.verifyTaskLaunched(driver, "o1") - assert(launchedTasks.head - .getCommand - .getEnvironment - .getVariablesCount == 3) // SPARK_SUBMIT_OPS and the secret - val variableOne = launchedTasks.head.getCommand.getEnvironment - .getVariablesList.asScala.filter(_.getName == "SECRET_ENV_KEY").head - assert(variableOne.getSecret.isInitialized) - assert(variableOne.getSecret.getType == Secret.Type.REFERENCE) - assert(variableOne.getSecret.getReference.getName == "/path/to/secret") - assert(variableOne.getType == Environment.Variable.Type.SECRET) - val variableTwo = launchedTasks.head.getCommand.getEnvironment - .getVariablesList.asScala.filter(_.getName == "PASSWORD").head - assert(variableTwo.getSecret.isInitialized) - assert(variableTwo.getSecret.getType == Secret.Type.REFERENCE) - assert(variableTwo.getSecret.getReference.getName == "/anothersecret") - assert(variableTwo.getType == Environment.Variable.Type.SECRET) + val launchedTasks = launchDriverTask( + Utils.configEnvBasedRefSecrets(config.driverSecretConfig)) + Utils.verifyEnvBasedRefSecrets(launchedTasks) } test("Creates an env-based value secrets.") { - setScheduler() - val mem = 1000 - val cpu = 1 - val secretValues = "user,password" - val envKeys = "USER,PASSWORD" - val driverDesc = new MesosDriverDescription( - "d1", - "jar", - mem, - cpu, - true, - command, - Map("spark.mesos.executor.home" -> "test", - "spark.app.name" -> "test", - "spark.mesos.driver.secret.values" -> secretValues, - "spark.mesos.driver.secret.envkeys" -> envKeys), - "s1", - new Date()) - val response = scheduler.submitDriver(driverDesc) - assert(response.success) - val offer = Utils.createOffer("o1", "s1", mem, cpu) - scheduler.resourceOffers(driver, Collections.singletonList(offer)) - val launchedTasks = Utils.verifyTaskLaunched(driver, "o1") - assert(launchedTasks.head - .getCommand - .getEnvironment - .getVariablesCount == 3) // SPARK_SUBMIT_OPS and the secret - val variableOne = launchedTasks.head.getCommand.getEnvironment - .getVariablesList.asScala.filter(_.getName == "USER").head - assert(variableOne.getSecret.isInitialized) - assert(variableOne.getSecret.getType == Secret.Type.VALUE) - assert(variableOne.getSecret.getValue.getData == ByteString.copyFrom("user".getBytes)) - assert(variableOne.getType == Environment.Variable.Type.SECRET) - val variableTwo = launchedTasks.head.getCommand.getEnvironment - .getVariablesList.asScala.filter(_.getName == 
"PASSWORD").head - assert(variableTwo.getSecret.isInitialized) - assert(variableTwo.getSecret.getType == Secret.Type.VALUE) - assert(variableTwo.getSecret.getValue.getData == ByteString.copyFrom("password".getBytes)) - assert(variableTwo.getType == Environment.Variable.Type.SECRET) + val launchedTasks = launchDriverTask( + Utils.configEnvBasedValueSecrets(config.driverSecretConfig)) + Utils.verifyEnvBasedValueSecrets(launchedTasks) } test("Creates file-based reference secrets.") { - setScheduler() - val mem = 1000 - val cpu = 1 - val secretName = "/path/to/secret,/anothersecret" - val secretPath = "/topsecret,/mypassword" - val driverDesc = new MesosDriverDescription( - "d1", - "jar", - mem, - cpu, - true, - command, - Map("spark.mesos.executor.home" -> "test", - "spark.app.name" -> "test", - "spark.mesos.driver.secret.names" -> secretName, - "spark.mesos.driver.secret.filenames" -> secretPath), - "s1", - new Date()) - val response = scheduler.submitDriver(driverDesc) - assert(response.success) - val offer = Utils.createOffer("o1", "s1", mem, cpu) - scheduler.resourceOffers(driver, Collections.singletonList(offer)) - val launchedTasks = Utils.verifyTaskLaunched(driver, "o1") - val volumes = launchedTasks.head.getContainer.getVolumesList - assert(volumes.size() == 2) - val secretVolOne = volumes.get(0) - assert(secretVolOne.getContainerPath == "/topsecret") - assert(secretVolOne.getSource.getSecret.getType == Secret.Type.REFERENCE) - assert(secretVolOne.getSource.getSecret.getReference.getName == "/path/to/secret") - val secretVolTwo = volumes.get(1) - assert(secretVolTwo.getContainerPath == "/mypassword") - assert(secretVolTwo.getSource.getSecret.getType == Secret.Type.REFERENCE) - assert(secretVolTwo.getSource.getSecret.getReference.getName == "/anothersecret") + val launchedTasks = launchDriverTask( + Utils.configFileBasedRefSecrets(config.driverSecretConfig)) + Utils.verifyFileBasedRefSecrets(launchedTasks) } test("Creates a file-based value secrets.") { + val launchedTasks = launchDriverTask( + Utils.configFileBasedValueSecrets(config.driverSecretConfig)) + Utils.verifyFileBasedValueSecrets(launchedTasks) + } + + private def launchDriverTask(addlSparkConfVars: Map[String, String]): List[TaskInfo] = { setScheduler() val mem = 1000 val cpu = 1 - val secretValues = "user,password" - val secretPath = "/whoami,/mypassword" val driverDesc = new MesosDriverDescription( "d1", "jar", @@ -475,27 +376,14 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi true, command, Map("spark.mesos.executor.home" -> "test", - "spark.app.name" -> "test", - "spark.mesos.driver.secret.values" -> secretValues, - "spark.mesos.driver.secret.filenames" -> secretPath), + "spark.app.name" -> "test") ++ + addlSparkConfVars, "s1", new Date()) val response = scheduler.submitDriver(driverDesc) assert(response.success) val offer = Utils.createOffer("o1", "s1", mem, cpu) scheduler.resourceOffers(driver, Collections.singletonList(offer)) - val launchedTasks = Utils.verifyTaskLaunched(driver, "o1") - val volumes = launchedTasks.head.getContainer.getVolumesList - assert(volumes.size() == 2) - val secretVolOne = volumes.get(0) - assert(secretVolOne.getContainerPath == "/whoami") - assert(secretVolOne.getSource.getSecret.getType == Secret.Type.VALUE) - assert(secretVolOne.getSource.getSecret.getValue.getData == - ByteString.copyFrom("user".getBytes)) - val secretVolTwo = volumes.get(1) - assert(secretVolTwo.getContainerPath == "/mypassword") - assert(secretVolTwo.getSource.getSecret.getType == 
Secret.Type.VALUE) - assert(secretVolTwo.getSource.getSecret.getValue.getData == - ByteString.copyFrom("password".getBytes)) + Utils.verifyTaskLaunched(driver, "o1") } } diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index 6c40792112f49..f4bd1ee9da6f7 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -21,7 +21,6 @@ import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import scala.concurrent.duration._ -import scala.reflect.ClassTag import org.apache.mesos.{Protos, Scheduler, SchedulerDriver} import org.apache.mesos.Protos._ @@ -38,7 +37,7 @@ import org.apache.spark.internal.config._ import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef} import org.apache.spark.scheduler.TaskSchedulerImpl -import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor, RemoveExecutor} +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor} import org.apache.spark.scheduler.cluster.mesos.Utils._ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite @@ -653,6 +652,37 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite offerResourcesAndVerify(2, true) } + test("Creates an env-based reference secrets.") { + val launchedTasks = launchExecutorTasks(configEnvBasedRefSecrets(executorSecretConfig)) + verifyEnvBasedRefSecrets(launchedTasks) + } + + test("Creates an env-based value secrets.") { + val launchedTasks = launchExecutorTasks(configEnvBasedValueSecrets(executorSecretConfig)) + verifyEnvBasedValueSecrets(launchedTasks) + } + + test("Creates file-based reference secrets.") { + val launchedTasks = launchExecutorTasks(configFileBasedRefSecrets(executorSecretConfig)) + verifyFileBasedRefSecrets(launchedTasks) + } + + test("Creates a file-based value secrets.") { + val launchedTasks = launchExecutorTasks(configFileBasedValueSecrets(executorSecretConfig)) + verifyFileBasedValueSecrets(launchedTasks) + } + + private def launchExecutorTasks(sparkConfVars: Map[String, String]): List[TaskInfo] = { + setBackend(sparkConfVars) + + val (mem, cpu) = (backend.executorMemory(sc), 4) + + val offer1 = createOffer("o1", "s1", mem, cpu) + backend.resourceOffers(driver, List(offer1).asJava) + + verifyTaskLaunched(driver, "o1") + } + private case class Resources(mem: Int, cpus: Int, gpus: Int = 0) private def registerMockExecutor(executorId: String, slaveId: String, cores: Integer) = { diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtilSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtilSuite.scala index f49d7c29eda49..442c43960ec1f 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtilSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtilSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster.mesos import 
org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.mesos.config class MesosSchedulerBackendUtilSuite extends SparkFunSuite { @@ -26,7 +27,8 @@ class MesosSchedulerBackendUtilSuite extends SparkFunSuite { conf.set("spark.mesos.executor.docker.parameters", "a,b") conf.set("spark.mesos.executor.docker.image", "test") - val containerInfo = MesosSchedulerBackendUtil.containerInfo(conf) + val containerInfo = MesosSchedulerBackendUtil.buildContainerInfo( + conf) val params = containerInfo.getDocker.getParametersList assert(params.size() == 0) @@ -37,7 +39,8 @@ class MesosSchedulerBackendUtilSuite extends SparkFunSuite { conf.set("spark.mesos.executor.docker.parameters", "a=1,b=2,c=3") conf.set("spark.mesos.executor.docker.image", "test") - val containerInfo = MesosSchedulerBackendUtil.containerInfo(conf) + val containerInfo = MesosSchedulerBackendUtil.buildContainerInfo( + conf) val params = containerInfo.getDocker.getParametersList assert(params.size() == 3) assert(params.get(0).getKey == "a") diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala index 833db0c1ff334..5636ac52bd4a7 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala @@ -24,9 +24,12 @@ import scala.collection.JavaConverters._ import org.apache.mesos.Protos._ import org.apache.mesos.Protos.Value.{Range => MesosRange, Ranges, Scalar} import org.apache.mesos.SchedulerDriver +import org.apache.mesos.protobuf.ByteString import org.mockito.{ArgumentCaptor, Matchers} import org.mockito.Mockito._ +import org.apache.spark.deploy.mesos.config.MesosSecretConfig + object Utils { val TEST_FRAMEWORK_ID = FrameworkID.newBuilder() @@ -105,4 +108,108 @@ object Utils { def createTaskId(taskId: String): TaskID = { TaskID.newBuilder().setValue(taskId).build() } + + def configEnvBasedRefSecrets(secretConfig: MesosSecretConfig): Map[String, String] = { + val secretName = "/path/to/secret,/anothersecret" + val envKey = "SECRET_ENV_KEY,PASSWORD" + Map( + secretConfig.SECRET_NAMES.key -> secretName, + secretConfig.SECRET_ENVKEYS.key -> envKey + ) + } + + def verifyEnvBasedRefSecrets(launchedTasks: List[TaskInfo]): Unit = { + val envVars = launchedTasks.head + .getCommand + .getEnvironment + .getVariablesList + .asScala + assert(envVars + .count(!_.getName.startsWith("SPARK_")) == 2) // user-defined secret env vars + val variableOne = envVars.filter(_.getName == "SECRET_ENV_KEY").head + assert(variableOne.getSecret.isInitialized) + assert(variableOne.getSecret.getType == Secret.Type.REFERENCE) + assert(variableOne.getSecret.getReference.getName == "/path/to/secret") + assert(variableOne.getType == Environment.Variable.Type.SECRET) + val variableTwo = envVars.filter(_.getName == "PASSWORD").head + assert(variableTwo.getSecret.isInitialized) + assert(variableTwo.getSecret.getType == Secret.Type.REFERENCE) + assert(variableTwo.getSecret.getReference.getName == "/anothersecret") + assert(variableTwo.getType == Environment.Variable.Type.SECRET) + } + + def configEnvBasedValueSecrets(secretConfig: MesosSecretConfig): Map[String, String] = { + val secretValues = "user,password" + val envKeys = "USER,PASSWORD" + Map( + secretConfig.SECRET_VALUES.key -> secretValues, + secretConfig.SECRET_ENVKEYS.key -> envKeys + ) + } + + def 
verifyEnvBasedValueSecrets(launchedTasks: List[TaskInfo]): Unit = { + val envVars = launchedTasks.head + .getCommand + .getEnvironment + .getVariablesList + .asScala + assert(envVars + .count(!_.getName.startsWith("SPARK_")) == 2) // user-defined secret env vars + val variableOne = envVars.filter(_.getName == "USER").head + assert(variableOne.getSecret.isInitialized) + assert(variableOne.getSecret.getType == Secret.Type.VALUE) + assert(variableOne.getSecret.getValue.getData == ByteString.copyFrom("user".getBytes)) + assert(variableOne.getType == Environment.Variable.Type.SECRET) + val variableTwo = envVars.filter(_.getName == "PASSWORD").head + assert(variableTwo.getSecret.isInitialized) + assert(variableTwo.getSecret.getType == Secret.Type.VALUE) + assert(variableTwo.getSecret.getValue.getData == ByteString.copyFrom("password".getBytes)) + assert(variableTwo.getType == Environment.Variable.Type.SECRET) + } + + def configFileBasedRefSecrets(secretConfig: MesosSecretConfig): Map[String, String] = { + val secretName = "/path/to/secret,/anothersecret" + val secretPath = "/topsecret,/mypassword" + Map( + secretConfig.SECRET_NAMES.key -> secretName, + secretConfig.SECRET_FILENAMES.key -> secretPath + ) + } + + def verifyFileBasedRefSecrets(launchedTasks: List[TaskInfo]): Unit = { + val volumes = launchedTasks.head.getContainer.getVolumesList + assert(volumes.size() == 2) + val secretVolOne = volumes.get(0) + assert(secretVolOne.getContainerPath == "/topsecret") + assert(secretVolOne.getSource.getSecret.getType == Secret.Type.REFERENCE) + assert(secretVolOne.getSource.getSecret.getReference.getName == "/path/to/secret") + val secretVolTwo = volumes.get(1) + assert(secretVolTwo.getContainerPath == "/mypassword") + assert(secretVolTwo.getSource.getSecret.getType == Secret.Type.REFERENCE) + assert(secretVolTwo.getSource.getSecret.getReference.getName == "/anothersecret") + } + + def configFileBasedValueSecrets(secretConfig: MesosSecretConfig): Map[String, String] = { + val secretValues = "user,password" + val secretPath = "/whoami,/mypassword" + Map( + secretConfig.SECRET_VALUES.key -> secretValues, + secretConfig.SECRET_FILENAMES.key -> secretPath + ) + } + + def verifyFileBasedValueSecrets(launchedTasks: List[TaskInfo]): Unit = { + val volumes = launchedTasks.head.getContainer.getVolumesList + assert(volumes.size() == 2) + val secretVolOne = volumes.get(0) + assert(secretVolOne.getContainerPath == "/whoami") + assert(secretVolOne.getSource.getSecret.getType == Secret.Type.VALUE) + assert(secretVolOne.getSource.getSecret.getValue.getData == + ByteString.copyFrom("user".getBytes)) + val secretVolTwo = volumes.get(1) + assert(secretVolTwo.getContainerPath == "/mypassword") + assert(secretVolTwo.getSource.getSecret.getType == Secret.Type.VALUE) + assert(secretVolTwo.getSource.getSecret.getValue.getData == + ByteString.copyFrom("password".getBytes)) + } } From 8e9863531bebbd4d83eafcbc2b359b8bd0ac5734 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 26 Oct 2017 16:55:30 -0700 Subject: [PATCH 772/779] [SPARK-22366] Support ignoring missing files ## What changes were proposed in this pull request? Add a flag "spark.sql.files.ignoreMissingFiles" to parallel the existing flag "spark.sql.files.ignoreCorruptFiles". ## How was this patch tested? new unit test Author: Jose Torres Closes #19581 from joseph-torres/SPARK-22366. 
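As a usage illustration only (not part of this patch), the new option is an ordinary SQL conf that mirrors `spark.sql.files.ignoreCorruptFiles`; the session name and paths below are hypothetical:

    # Hypothetical PySpark sketch: tolerate files deleted between planning and execution.
    spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")  # defaults to "false"
    df = spark.read.parquet("/data/events/first", "/data/events/second")
    # If a file listed at planning time is gone by execution time, the scan logs a
    # warning, skips the missing file, and returns the rows that were actually read.
    df.count()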
--- .../apache/spark/sql/internal/SQLConf.scala | 8 +++++ .../execution/datasources/FileScanRDD.scala | 13 +++++--- .../parquet/ParquetQuerySuite.scala | 33 +++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4cfe53b2c115b..21e4685fcc456 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -614,6 +614,12 @@ object SQLConf { .booleanConf .createWithDefault(false) + val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles") + .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " + + "encountering missing files and the contents that have been read will still be returned.") + .booleanConf + .createWithDefault(false) + val MAX_RECORDS_PER_FILE = buildConf("spark.sql.files.maxRecordsPerFile") .doc("Maximum number of records to write out to a single file. " + "If this value is zero or negative, there is no limit.") @@ -1014,6 +1020,8 @@ class SQLConf extends Serializable with Logging { def ignoreCorruptFiles: Boolean = getConf(IGNORE_CORRUPT_FILES) + def ignoreMissingFiles: Boolean = getConf(IGNORE_MISSING_FILES) + def maxRecordsPerFile: Long = getConf(MAX_RECORDS_PER_FILE) def useCompression: Boolean = getConf(COMPRESS_CACHED) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 9df20731c71d5..8731ee88f87f2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -66,6 +66,7 @@ class FileScanRDD( extends RDD[InternalRow](sparkSession.sparkContext, Nil) { private val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles + private val ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles override def compute(split: RDDPartition, context: TaskContext): Iterator[InternalRow] = { val iterator = new Iterator[Object] with AutoCloseable { @@ -142,7 +143,7 @@ class FileScanRDD( // Sets InputFileBlockHolder for the file block's information InputFileBlockHolder.set(currentFile.filePath, currentFile.start, currentFile.length) - if (ignoreCorruptFiles) { + if (ignoreMissingFiles || ignoreCorruptFiles) { currentIterator = new NextIterator[Object] { // The readFunction may read some bytes before consuming the iterator, e.g., // vectorized Parquet reader. 
Here we use lazy val to delay the creation of @@ -158,9 +159,13 @@ null } } catch { - // Throw FileNotFoundException even `ignoreCorruptFiles` is true - case e: FileNotFoundException => throw e - case e @ (_: RuntimeException | _: IOException) => + case e: FileNotFoundException if ignoreMissingFiles => + logWarning(s"Skipped missing file: $currentFile", e) + finished = true + null + // Throw FileNotFoundException even if `ignoreCorruptFiles` is true + case e: FileNotFoundException if !ignoreMissingFiles => throw e + case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => logWarning( s"Skipped the rest of the content in the corrupted file: $currentFile", e) finished = true diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 2efff3f57d7d3..e822e40b146ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -316,6 +316,39 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } + testQuietly("Enabling/disabling ignoreMissingFiles") { + def testIgnoreMissingFiles(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString) + val thirdPath = new Path(basePath, "third") + spark.range(2, 3).toDF("a").write.parquet(thirdPath.toString) + val df = spark.read.parquet( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + + val fs = thirdPath.getFileSystem(spark.sparkContext.hadoopConfiguration) + fs.delete(thirdPath, true) + checkAnswer( + df, + Seq(Row(0), Row(1))) + } + } + + withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") { + testIgnoreMissingFiles() + } + + withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "false") { + val exception = intercept[SparkException] { + testIgnoreMissingFiles() + } + assert(exception.getMessage().contains("does not exist")) + } + } + /** * this is part of test 'Enabling/disabling ignoreCorruptFiles' but run in a loop * to increase the chance of failure From 9b262f6a08c0c1b474d920d49b9fdd574c401d39 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 26 Oct 2017 17:39:53 -0700 Subject: [PATCH 773/779] [SPARK-22356][SQL] data source table should support overlapped columns between data and partition schema ## What changes were proposed in this pull request? This is a regression introduced by #14207. After Spark 2.1, we store the inferred schema when creating the table, to avoid inferring schema again at read path. However, there is one special case: overlapped columns between data and partition. For this case, it breaks the assumption of table schema that there is no overlap between data and partition schema, and partition columns should be at the end. The result is, for Spark 2.1, the table scan has incorrect schema that puts partition columns at the end. For Spark 2.2, we add a check in CatalogTable to validate table schema, which fails in this case. To fix this issue, a simple and safe approach is to fall back to the old behavior when overlapped columns are detected, i.e. store empty schema in metastore. ## How was this patch tested?
new regression test Author: Wenchen Fan Closes #19579 from cloud-fan/bug2. --- .../command/createDataSourceTables.scala | 35 +++++++++++++---- .../datasources/HadoopFsRelation.scala | 25 ++++++++---- .../org/apache/spark/sql/SQLQuerySuite.scala | 16 ++++++++ .../HiveExternalCatalogVersionsSuite.scala | 38 ++++++++++++++----- 4 files changed, 89 insertions(+), 25 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 9e3907996995c..306f43dc4214a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.BaseRelation +import org.apache.spark.sql.types.StructType /** * A command used to create a data source table. @@ -85,14 +86,32 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo } } - val newTable = table.copy( - schema = dataSource.schema, - partitionColumnNames = partitionColumnNames, - // If metastore partition management for file source tables is enabled, we start off with - // partition provider hive, but no partitions in the metastore. The user has to call - // `msck repair table` to populate the table partitions. - tracksPartitionsInCatalog = partitionColumnNames.nonEmpty && - sessionState.conf.manageFilesourcePartitions) + val newTable = dataSource match { + // Since Spark 2.1, we store the inferred schema of data source in metastore, to avoid + // inferring the schema again at read path. However if the data source has overlapped columns + // between data and partition schema, we can't store it in metastore as it breaks the + // assumption of table schema. Here we fallback to the behavior of Spark prior to 2.1, store + // empty schema in metastore and infer it at runtime. Note that this also means the new + // scalable partitioning handling feature(introduced at Spark 2.1) is disabled in this case. + case r: HadoopFsRelation if r.overlappedPartCols.nonEmpty => + logWarning("It is not recommended to create a table with overlapped data and partition " + + "columns, as Spark cannot store a valid table schema and has to infer it at runtime, " + + "which hurts performance. Please check your data files and remove the partition " + + "columns in it.") + table.copy(schema = new StructType(), partitionColumnNames = Nil) + + case _ => + table.copy( + schema = dataSource.schema, + partitionColumnNames = partitionColumnNames, + // If metastore partition management for file source tables is enabled, we start off with + // partition provider hive, but no partitions in the metastore. The user has to call + // `msck repair table` to populate the table partitions. + tracksPartitionsInCatalog = partitionColumnNames.nonEmpty && + sessionState.conf.manageFilesourcePartitions) + + } + // We will return Nil or throw exception at the beginning if the table already exists, so when // we reach here, the table should not exist and we should set `ignoreIfExists` to false. 
sessionState.catalog.createTable(newTable, ignoreIfExists = false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index 9a08524476baa..89d8a85a9cbd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources +import java.util.Locale + import scala.collection.mutable import org.apache.spark.sql.{SparkSession, SQLContext} @@ -50,15 +52,22 @@ case class HadoopFsRelation( override def sqlContext: SQLContext = sparkSession.sqlContext - val schema: StructType = { - val getColName: (StructField => String) = - if (sparkSession.sessionState.conf.caseSensitiveAnalysis) _.name else _.name.toLowerCase - val overlappedPartCols = mutable.Map.empty[String, StructField] - partitionSchema.foreach { partitionField => - if (dataSchema.exists(getColName(_) == getColName(partitionField))) { - overlappedPartCols += getColName(partitionField) -> partitionField - } + private def getColName(f: StructField): String = { + if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { + f.name + } else { + f.name.toLowerCase(Locale.ROOT) + } + } + + val overlappedPartCols = mutable.Map.empty[String, StructField] + partitionSchema.foreach { partitionField => + if (dataSchema.exists(getColName(_) == getColName(partitionField))) { + overlappedPartCols += getColName(partitionField) -> partitionField } + } + + val schema: StructType = { StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index caf332d050d7b..5d0bba69daca1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2741,4 +2741,20 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { assert (aggregateExpressions.isDefined) assert (aggregateExpressions.get.size == 2) } + + test("SPARK-22356: overlapped columns between data and partition schema in data source tables") { + withTempPath { path => + Seq((1, 1, 1), (1, 2, 1)).toDF("i", "p", "j") + .write.mode("overwrite").parquet(new File(path, "p=1").getCanonicalPath) + withTable("t") { + sql(s"create table t using parquet options(path='${path.getCanonicalPath}')") + // We should respect the column order in data schema. + assert(spark.table("t").columns === Array("i", "p", "j")) + checkAnswer(spark.table("t"), Row(1, 1, 1) :: Row(1, 1, 1) :: Nil) + // The DESC TABLE should report same schema as table scan. 
+ assert(sql("desc t").select("col_name") + .as[String].collect().mkString(",").contains("i,p,j")) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 5f8c9d5799662..6859432c406a9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -40,7 +40,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private val tmpDataDir = Utils.createTempDir(namePrefix = "test-data") // For local test, you can set `sparkTestingDir` to a static value like `/tmp/test-spark`, to // avoid downloading Spark of different versions in each run. - private val sparkTestingDir = Utils.createTempDir(namePrefix = "test-spark") + private val sparkTestingDir = new File("/tmp/test-spark") private val unusedJar = TestUtils.createJarWithClasses(Seq.empty) override def afterAll(): Unit = { @@ -77,35 +77,38 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { super.beforeAll() val tempPyFile = File.createTempFile("test", ".py") + // scalastyle:off line.size.limit Files.write(tempPyFile.toPath, s""" |from pyspark.sql import SparkSession + |import os | |spark = SparkSession.builder.enableHiveSupport().getOrCreate() |version_index = spark.conf.get("spark.sql.test.version.index", None) | |spark.sql("create table data_source_tbl_{} using json as select 1 i".format(version_index)) | - |spark.sql("create table hive_compatible_data_source_tbl_" + version_index + \\ - | " using parquet as select 1 i") + |spark.sql("create table hive_compatible_data_source_tbl_{} using parquet as select 1 i".format(version_index)) | |json_file = "${genDataDir("json_")}" + str(version_index) |spark.range(1, 2).selectExpr("cast(id as int) as i").write.json(json_file) - |spark.sql("create table external_data_source_tbl_" + version_index + \\ - | "(i int) using json options (path '{}')".format(json_file)) + |spark.sql("create table external_data_source_tbl_{}(i int) using json options (path '{}')".format(version_index, json_file)) | |parquet_file = "${genDataDir("parquet_")}" + str(version_index) |spark.range(1, 2).selectExpr("cast(id as int) as i").write.parquet(parquet_file) - |spark.sql("create table hive_compatible_external_data_source_tbl_" + version_index + \\ - | "(i int) using parquet options (path '{}')".format(parquet_file)) + |spark.sql("create table hive_compatible_external_data_source_tbl_{}(i int) using parquet options (path '{}')".format(version_index, parquet_file)) | |json_file2 = "${genDataDir("json2_")}" + str(version_index) |spark.range(1, 2).selectExpr("cast(id as int) as i").write.json(json_file2) - |spark.sql("create table external_table_without_schema_" + version_index + \\ - | " using json options (path '{}')".format(json_file2)) + |spark.sql("create table external_table_without_schema_{} using json options (path '{}')".format(version_index, json_file2)) + | + |parquet_file2 = "${genDataDir("parquet2_")}" + str(version_index) + |spark.range(1, 3).selectExpr("1 as i", "cast(id as int) as p", "1 as j").write.parquet(os.path.join(parquet_file2, "p=1")) + |spark.sql("create table tbl_with_col_overlap_{} using parquet options(path '{}')".format(version_index, parquet_file2)) | |spark.sql("create view v_{} as select 1 i".format(version_index)) """.stripMargin.getBytes("utf8")) + // scalastyle:on 
line.size.limit PROCESS_TABLES.testingVersions.zipWithIndex.foreach { case (version, index) => val sparkHome = new File(sparkTestingDir, s"spark-$version") @@ -153,6 +156,7 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { .enableHiveSupport() .getOrCreate() spark = session + import session.implicits._ testingVersions.indices.foreach { index => Seq( @@ -194,6 +198,22 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { // test permanent view checkAnswer(sql(s"select i from v_$index"), Row(1)) + + // SPARK-22356: overlapped columns between data and partition schema in data source tables + val tbl_with_col_overlap = s"tbl_with_col_overlap_$index" + // For Spark 2.2.0 and 2.1.x, the behavior is different from Spark 2.0. + if (testingVersions(index).startsWith("2.1") || testingVersions(index) == "2.2.0") { + spark.sql("msck repair table " + tbl_with_col_overlap) + assert(spark.table(tbl_with_col_overlap).columns === Array("i", "j", "p")) + checkAnswer(spark.table(tbl_with_col_overlap), Row(1, 1, 1) :: Row(1, 1, 1) :: Nil) + assert(sql("desc " + tbl_with_col_overlap).select("col_name") + .as[String].collect().mkString(",").contains("i,j,p")) + } else { + assert(spark.table(tbl_with_col_overlap).columns === Array("i", "p", "j")) + checkAnswer(spark.table(tbl_with_col_overlap), Row(1, 1, 1) :: Row(1, 1, 1) :: Nil) + assert(sql("desc " + tbl_with_col_overlap).select("col_name") + .as[String].collect().mkString(",").contains("i,p,j")) + } } } } From 5c3a1f3fad695317c2fff1243cdb9b3ceb25c317 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 26 Oct 2017 17:51:16 -0700 Subject: [PATCH 774/779] [SPARK-22355][SQL] Dataset.collect is not threadsafe ## What changes were proposed in this pull request? It's possible that users create a `Dataset`, and call `collect` of this `Dataset` in many threads at the same time. Currently `Dataset#collect` just call `encoder.fromRow` to convert spark rows to objects of type T, and this encoder is per-dataset. This means `Dataset#collect` is not thread-safe, because the encoder uses a projection to output the object to a re-usable row. This PR fixes this problem, by creating a new projection when calling `Dataset#collect`, so that we have the re-usable row for each method call, instead of each Dataset. ## How was this patch tested? N/A Author: Wenchen Fan Closes #19577 from cloud-fan/encoder. --- .../scala/org/apache/spark/sql/Dataset.scala | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index b70dfc05330f8..0e23983786b08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.encoders._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JSONOptions} import org.apache.spark.sql.catalyst.optimizer.CombineUnions import org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils} @@ -198,15 +199,10 @@ class Dataset[T] private[sql]( */ private[sql] implicit val exprEnc: ExpressionEncoder[T] = encoderFor(encoder) - /** - * Encoder is used mostly as a container of serde expressions in Dataset. 
We build logical - * plans by these serde expressions and execute it within the query framework. However, for - * performance reasons we may want to use encoder as a function to deserialize internal rows to - * custom objects, e.g. collect. Here we resolve and bind the encoder so that we can call its - * `fromRow` method later. - */ - private val boundEnc = - exprEnc.resolveAndBind(logicalPlan.output, sparkSession.sessionState.analyzer) + // The deserializer expression which can be used to build a projection and turn rows to objects + // of type T, after collecting rows to the driver side. + private val deserializer = + exprEnc.resolveAndBind(logicalPlan.output, sparkSession.sessionState.analyzer).deserializer private implicit def classTag = exprEnc.clsTag @@ -2661,7 +2657,15 @@ class Dataset[T] private[sql]( */ def toLocalIterator(): java.util.Iterator[T] = { withAction("toLocalIterator", queryExecution) { plan => - plan.executeToIterator().map(boundEnc.fromRow).asJava + // This projection writes output to a `InternalRow`, which means applying this projection is + // not thread-safe. Here we create the projection inside this method to make `Dataset` + // thread-safe. + val objProj = GenerateSafeProjection.generate(deserializer :: Nil) + plan.executeToIterator().map { row => + // The row returned by SafeProjection is `SpecificInternalRow`, which ignore the data type + // parameter of its `get` method, so it's safe to use null here. + objProj(row).get(0, null).asInstanceOf[T] + }.asJava } } @@ -3102,7 +3106,14 @@ class Dataset[T] private[sql]( * Collect all elements from a spark plan. */ private def collectFromPlan(plan: SparkPlan): Array[T] = { - plan.executeCollect().map(boundEnc.fromRow) + // This projection writes output to a `InternalRow`, which means applying this projection is not + // thread-safe. Here we create the projection inside this method to make `Dataset` thread-safe. + val objProj = GenerateSafeProjection.generate(deserializer :: Nil) + plan.executeCollect().map { row => + // The row returned by SafeProjection is `SpecificInternalRow`, which ignore the data type + // parameter of its `get` method, so it's safe to use null here. + objProj(row).get(0, null).asInstanceOf[T] + } } private def sortInternal(global: Boolean, sortExprs: Seq[Column]): Dataset[T] = { From 17af727e38c3faaeab5b91a8cdab5f2181cf3fc4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 26 Oct 2017 23:02:46 -0700 Subject: [PATCH 775/779] [SPARK-21375][PYSPARK][SQL] Add Date and Timestamp support to ArrowConverters for toPandas() Conversion ## What changes were proposed in this pull request? Adding date and timestamp support with Arrow for `toPandas()` and `pandas_udf`s. Timestamps are stored in Arrow as UTC and manifested to the user as timezone-naive localized to the Python system timezone. ## How was this patch tested? Added Scala tests for date and timestamp types under ArrowConverters, ArrowUtils, and ArrowWriter suites. Added Python tests for `toPandas()` and `pandas_udf`s with date and timestamp types. Author: Bryan Cutler Author: Takuya UESHIN Closes #18664 from BryanCutler/arrow-date-timestamp-SPARK-21375. 
--- python/pyspark/serializers.py | 24 +++- python/pyspark/sql/dataframe.py | 7 +- python/pyspark/sql/tests.py | 106 ++++++++++++++-- python/pyspark/sql/types.py | 36 ++++++ .../vectorized/ArrowColumnVector.java | 34 +++++ .../scala/org/apache/spark/sql/Dataset.scala | 4 +- .../sql/execution/arrow/ArrowConverters.scala | 3 +- .../sql/execution/arrow/ArrowUtils.scala | 30 +++-- .../sql/execution/arrow/ArrowWriter.scala | 39 +++++- .../python/ArrowEvalPythonExec.scala | 2 +- .../execution/python/ArrowPythonRunner.scala | 5 +- .../python/FlatMapGroupsInPandasExec.scala | 4 +- .../arrow/ArrowConvertersSuite.scala | 120 ++++++++++++++++-- .../sql/execution/arrow/ArrowUtilsSuite.scala | 26 +++- .../execution/arrow/ArrowWriterSuite.scala | 24 ++-- .../vectorized/ArrowColumnVectorSuite.scala | 22 ++-- .../vectorized/ColumnarBatchSuite.scala | 4 +- 17 files changed, 417 insertions(+), 73 deletions(-) diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index a0adeed994456..d7979f095da76 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -214,6 +214,7 @@ def __repr__(self): def _create_batch(series): + from pyspark.sql.types import _check_series_convert_timestamps_internal import pyarrow as pa # Make input conform to [(series1, type1), (series2, type2), ...] if not isinstance(series, (list, tuple)) or \ @@ -224,12 +225,25 @@ def _create_batch(series): # If a nullable integer series has been promoted to floating point with NaNs, need to cast # NOTE: this is not necessary with Arrow >= 0.7 def cast_series(s, t): - if t is None or s.dtype == t.to_pandas_dtype(): + if type(t) == pa.TimestampType: + # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680 + return _check_series_convert_timestamps_internal(s.fillna(0))\ + .values.astype('datetime64[us]', copy=False) + elif t == pa.date32(): + # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8 + return s.dt.date + elif t is None or s.dtype == t.to_pandas_dtype(): return s else: return s.fillna(0).astype(t.to_pandas_dtype(), copy=False) - arrs = [pa.Array.from_pandas(cast_series(s, t), mask=s.isnull(), type=t) for s, t in series] + # Some object types don't support masks in Arrow, see ARROW-1721 + def create_array(s, t): + casted = cast_series(s, t) + mask = None if casted.dtype == 'object' else s.isnull() + return pa.Array.from_pandas(casted, mask=mask, type=t) + + arrs = [create_array(s, t) for s, t in series] return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) @@ -260,11 +274,13 @@ def load_stream(self, stream): """ Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series. 
""" + from pyspark.sql.types import _check_dataframe_localize_timestamps import pyarrow as pa reader = pa.open_stream(stream) for batch in reader: - table = pa.Table.from_batches([batch]) - yield [c.to_pandas() for c in table.itercolumns()] + # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1 + pdf = _check_dataframe_localize_timestamps(batch.to_pandas()) + yield [c for _, c in pdf.iteritems()] def __repr__(self): return "ArrowStreamPandasSerializer" diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index c0b574e2b93a1..406686e6df724 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1883,11 +1883,13 @@ def toPandas(self): import pandas as pd if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: + from pyspark.sql.types import _check_dataframe_localize_timestamps import pyarrow tables = self._collectAsArrow() if tables: table = pyarrow.concat_tables(tables) - return table.to_pandas() + pdf = table.to_pandas() + return _check_dataframe_localize_timestamps(pdf) else: return pd.DataFrame.from_records([], columns=self.columns) except ImportError as e: @@ -1955,6 +1957,7 @@ def _to_corrected_pandas_type(dt): """ When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong. This method gets the corrected data type for Pandas if that type may be inferred uncorrectly. + NOTE: DateType is inferred incorrectly as 'object', TimestampType is correct with datetime64[ns] """ import numpy as np if type(dt) == ByteType: @@ -1965,6 +1968,8 @@ def _to_corrected_pandas_type(dt): return np.int32 elif type(dt) == FloatType: return np.float32 + elif type(dt) == DateType: + return 'datetime64[ns]' else: return None diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 685eebcafefba..98afae662b42d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3086,18 +3086,38 @@ class ArrowTests(ReusedPySparkTestCase): @classmethod def setUpClass(cls): + from datetime import datetime ReusedPySparkTestCase.setUpClass() + + # Synchronize default timezone between Python and Java + cls.tz_prev = os.environ.get("TZ", None) # save current tz if set + tz = "America/Los_Angeles" + os.environ["TZ"] = tz + time.tzset() + cls.spark = SparkSession(cls.sc) + cls.spark.conf.set("spark.sql.session.timeZone", tz) cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true") cls.schema = StructType([ StructField("1_str_t", StringType(), True), StructField("2_int_t", IntegerType(), True), StructField("3_long_t", LongType(), True), StructField("4_float_t", FloatType(), True), - StructField("5_double_t", DoubleType(), True)]) - cls.data = [("a", 1, 10, 0.2, 2.0), - ("b", 2, 20, 0.4, 4.0), - ("c", 3, 30, 0.8, 6.0)] + StructField("5_double_t", DoubleType(), True), + StructField("6_date_t", DateType(), True), + StructField("7_timestamp_t", TimestampType(), True)]) + cls.data = [("a", 1, 10, 0.2, 2.0, datetime(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), + ("b", 2, 20, 0.4, 4.0, datetime(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), + ("c", 3, 30, 0.8, 6.0, datetime(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] + + @classmethod + def tearDownClass(cls): + del os.environ["TZ"] + if cls.tz_prev is not None: + os.environ["TZ"] = cls.tz_prev + time.tzset() + ReusedPySparkTestCase.tearDownClass() + cls.spark.stop() def assertFramesEqual(self, df_with_arrow, df_without): msg = ("DataFrame from Arrow is not equal" + @@ -3106,8 +3126,8 @@ 
def assertFramesEqual(self, df_with_arrow, df_without): self.assertTrue(df_without.equals(df_with_arrow), msg=msg) def test_unsupported_datatype(self): - schema = StructType([StructField("dt", DateType(), True)]) - df = self.spark.createDataFrame([(datetime.date(1970, 1, 1),)], schema=schema) + schema = StructType([StructField("decimal", DecimalType(), True)]) + df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): self.assertRaises(Exception, lambda: df.toPandas()) @@ -3385,13 +3405,77 @@ def test_vectorized_udf_varargs(self): def test_vectorized_udf_unsupported_types(self): from pyspark.sql.functions import pandas_udf, col - schema = StructType([StructField("dt", DateType(), True)]) - df = self.spark.createDataFrame([(datetime.date(1970, 1, 1),)], schema=schema) - f = pandas_udf(lambda x: x, DateType()) + schema = StructType([StructField("dt", DecimalType(), True)]) + df = self.spark.createDataFrame([(None,)], schema=schema) + f = pandas_udf(lambda x: x, DecimalType()) with QuietTest(self.sc): with self.assertRaisesRegexp(Exception, 'Unsupported data type'): df.select(f(col('dt'))).collect() + def test_vectorized_udf_null_date(self): + from pyspark.sql.functions import pandas_udf, col + from datetime import date + schema = StructType().add("date", DateType()) + data = [(date(1969, 1, 1),), + (date(2012, 2, 2),), + (None,), + (date(2100, 4, 4),)] + df = self.spark.createDataFrame(data, schema=schema) + date_f = pandas_udf(lambda t: t, returnType=DateType()) + res = df.select(date_f(col("date"))) + self.assertEquals(df.collect(), res.collect()) + + def test_vectorized_udf_timestamps(self): + from pyspark.sql.functions import pandas_udf, col + from datetime import datetime + schema = StructType([ + StructField("idx", LongType(), True), + StructField("timestamp", TimestampType(), True)]) + data = [(0, datetime(1969, 1, 1, 1, 1, 1)), + (1, datetime(2012, 2, 2, 2, 2, 2)), + (2, None), + (3, datetime(2100, 4, 4, 4, 4, 4))] + df = self.spark.createDataFrame(data, schema=schema) + + # Check that a timestamp passed through a pandas_udf will not be altered by timezone calc + f_timestamp_copy = pandas_udf(lambda t: t, returnType=TimestampType()) + df = df.withColumn("timestamp_copy", f_timestamp_copy(col("timestamp"))) + + @pandas_udf(returnType=BooleanType()) + def check_data(idx, timestamp, timestamp_copy): + is_equal = timestamp.isnull() # use this array to check values are equal + for i in range(len(idx)): + # Check that timestamps are as expected in the UDF + is_equal[i] = (is_equal[i] and data[idx[i]][1] is None) or \ + timestamp[i].to_pydatetime() == data[idx[i]][1] + return is_equal + + result = df.withColumn("is_equal", check_data(col("idx"), col("timestamp"), + col("timestamp_copy"))).collect() + # Check that collection values are correct + self.assertEquals(len(data), len(result)) + for i in range(len(result)): + self.assertEquals(data[i][1], result[i][1]) # "timestamp" col + self.assertTrue(result[i][3]) # "is_equal" data in udf was as expected + + def test_vectorized_udf_return_timestamp_tz(self): + from pyspark.sql.functions import pandas_udf, col + import pandas as pd + df = self.spark.range(10) + + @pandas_udf(returnType=TimestampType()) + def gen_timestamps(id): + ts = [pd.Timestamp(i, unit='D', tz='America/Los_Angeles') for i in id] + return pd.Series(ts) + + result = df.withColumn("ts", gen_timestamps(col("id"))).collect() + spark_ts_t = TimestampType() + for r in result: + i, ts = r + ts_tz = pd.Timestamp(i, unit='D', 
tz='America/Los_Angeles').to_pydatetime() + expected = spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz)) + self.assertEquals(expected, ts) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedPySparkTestCase): @@ -3550,8 +3634,8 @@ def test_wrong_args(self): def test_unsupported_types(self): from pyspark.sql.functions import pandas_udf, col schema = StructType( - [StructField("id", LongType(), True), StructField("dt", DateType(), True)]) - df = self.spark.createDataFrame([(1, datetime.date(1970, 1, 1),)], schema=schema) + [StructField("id", LongType(), True), StructField("dt", DecimalType(), True)]) + df = self.spark.createDataFrame([(1, None,)], schema=schema) f = pandas_udf(lambda x: x, df.schema) with QuietTest(self.sc): with self.assertRaisesRegexp(Exception, 'Unsupported data type'): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index f65273d5f0b6c..7dd8fa04160e0 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1619,11 +1619,47 @@ def to_arrow_type(dt): arrow_type = pa.decimal(dt.precision, dt.scale) elif type(dt) == StringType: arrow_type = pa.string() + elif type(dt) == DateType: + arrow_type = pa.date32() + elif type(dt) == TimestampType: + # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read + arrow_type = pa.timestamp('us', tz='UTC') else: raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) return arrow_type +def _check_dataframe_localize_timestamps(pdf): + """ + Convert timezone aware timestamps to timezone-naive in local time + + :param pdf: pandas.DataFrame + :return pandas.DataFrame where any timezone aware columns have be converted to tz-naive + """ + from pandas.api.types import is_datetime64tz_dtype + for column, series in pdf.iteritems(): + # TODO: handle nested timestamps, such as ArrayType(TimestampType())? + if is_datetime64tz_dtype(series.dtype): + pdf[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None) + return pdf + + +def _check_series_convert_timestamps_internal(s): + """ + Convert a tz-naive timestamp in local tz to UTC normalized for Spark internal storage + :param s: a pandas.Series + :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone + """ + from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype + # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
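+    # The branches below normalize a timestamp series to UTC for Spark's internal storage:
+    # a tz-naive (datetime64[ns]) series is treated as local time, so it is first localized
+    # to the local timezone and then converted to UTC, while a tz-aware series only needs the
+    # conversion to UTC. For example, 1970-01-01 00:00 in America/Los_Angeles (PST, UTC-8)
+    # becomes 1970-01-01 08:00 UTC. Non-timestamp series are returned unchanged.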
+ if is_datetime64_dtype(s.dtype): + return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC') + elif is_datetime64tz_dtype(s.dtype): + return s.dt.tz_convert('UTC') + else: + return s + + def _test(): import doctest from pyspark.context import SparkContext diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 1f171049820b2..51ea719f8c4a6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -320,6 +320,10 @@ public ArrowColumnVector(ValueVector vector) { accessor = new StringAccessor((NullableVarCharVector) vector); } else if (vector instanceof NullableVarBinaryVector) { accessor = new BinaryAccessor((NullableVarBinaryVector) vector); + } else if (vector instanceof NullableDateDayVector) { + accessor = new DateAccessor((NullableDateDayVector) vector); + } else if (vector instanceof NullableTimeStampMicroTZVector) { + accessor = new TimestampAccessor((NullableTimeStampMicroTZVector) vector); } else if (vector instanceof ListVector) { ListVector listVector = (ListVector) vector; accessor = new ArrayAccessor(listVector); @@ -575,6 +579,36 @@ final byte[] getBinary(int rowId) { } } + private static class DateAccessor extends ArrowVectorAccessor { + + private final NullableDateDayVector.Accessor accessor; + + DateAccessor(NullableDateDayVector vector) { + super(vector); + this.accessor = vector.getAccessor(); + } + + @Override + final int getInt(int rowId) { + return accessor.get(rowId); + } + } + + private static class TimestampAccessor extends ArrowVectorAccessor { + + private final NullableTimeStampMicroTZVector.Accessor accessor; + + TimestampAccessor(NullableTimeStampMicroTZVector vector) { + super(vector); + this.accessor = vector.getAccessor(); + } + + @Override + final long getLong(int rowId) { + return accessor.get(rowId); + } + } + private static class ArrayAccessor extends ArrowVectorAccessor { private final UInt4Vector.Accessor accessor; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 0e23983786b08..fe4e192e43dfe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3154,9 +3154,11 @@ class Dataset[T] private[sql]( private[sql] def toArrowPayload: RDD[ArrowPayload] = { val schemaCaptured = this.schema val maxRecordsPerBatch = sparkSession.sessionState.conf.arrowMaxRecordsPerBatch + val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone queryExecution.toRdd.mapPartitionsInternal { iter => val context = TaskContext.get() - ArrowConverters.toPayloadIterator(iter, schemaCaptured, maxRecordsPerBatch, context) + ArrowConverters.toPayloadIterator( + iter, schemaCaptured, maxRecordsPerBatch, timeZoneId, context) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index 561a067a2f81f..05ea1517fcac9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -74,9 +74,10 @@ private[sql] object ArrowConverters { rowIter: Iterator[InternalRow], schema: StructType, 
maxRecordsPerBatch: Int, + timeZoneId: String, context: TaskContext): Iterator[ArrowPayload] = { - val arrowSchema = ArrowUtils.toArrowSchema(schema) + val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) val allocator = ArrowUtils.rootAllocator.newChildAllocator("toPayloadIterator", 0, Long.MaxValue) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala index 2caf1ef02909a..6ad11bda84bf6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.arrow import scala.collection.JavaConverters._ import org.apache.arrow.memory.RootAllocator -import org.apache.arrow.vector.types.FloatingPointPrecision +import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision, TimeUnit} import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} import org.apache.spark.sql.types._ @@ -31,7 +31,8 @@ object ArrowUtils { // todo: support more types. - def toArrowType(dt: DataType): ArrowType = dt match { + /** Maps data type from Spark to Arrow. NOTE: timeZoneId required for TimestampTypes */ + def toArrowType(dt: DataType, timeZoneId: String): ArrowType = dt match { case BooleanType => ArrowType.Bool.INSTANCE case ByteType => new ArrowType.Int(8, true) case ShortType => new ArrowType.Int(8 * 2, true) @@ -42,6 +43,13 @@ object ArrowUtils { case StringType => ArrowType.Utf8.INSTANCE case BinaryType => ArrowType.Binary.INSTANCE case DecimalType.Fixed(precision, scale) => new ArrowType.Decimal(precision, scale) + case DateType => new ArrowType.Date(DateUnit.DAY) + case TimestampType => + if (timeZoneId == null) { + throw new UnsupportedOperationException("TimestampType must supply timeZoneId parameter") + } else { + new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZoneId) + } case _ => throw new UnsupportedOperationException(s"Unsupported data type: ${dt.simpleString}") } @@ -58,22 +66,27 @@ object ArrowUtils { case ArrowType.Utf8.INSTANCE => StringType case ArrowType.Binary.INSTANCE => BinaryType case d: ArrowType.Decimal => DecimalType(d.getPrecision, d.getScale) + case date: ArrowType.Date if date.getUnit == DateUnit.DAY => DateType + case ts: ArrowType.Timestamp if ts.getUnit == TimeUnit.MICROSECOND => TimestampType case _ => throw new UnsupportedOperationException(s"Unsupported data type: $dt") } - def toArrowField(name: String, dt: DataType, nullable: Boolean): Field = { + /** Maps field from Spark to Arrow. 
NOTE: timeZoneId required for TimestampType */ + def toArrowField( + name: String, dt: DataType, nullable: Boolean, timeZoneId: String): Field = { dt match { case ArrayType(elementType, containsNull) => val fieldType = new FieldType(nullable, ArrowType.List.INSTANCE, null) - new Field(name, fieldType, Seq(toArrowField("element", elementType, containsNull)).asJava) + new Field(name, fieldType, + Seq(toArrowField("element", elementType, containsNull, timeZoneId)).asJava) case StructType(fields) => val fieldType = new FieldType(nullable, ArrowType.Struct.INSTANCE, null) new Field(name, fieldType, fields.map { field => - toArrowField(field.name, field.dataType, field.nullable) + toArrowField(field.name, field.dataType, field.nullable, timeZoneId) }.toSeq.asJava) case dataType => - val fieldType = new FieldType(nullable, toArrowType(dataType), null) + val fieldType = new FieldType(nullable, toArrowType(dataType, timeZoneId), null) new Field(name, fieldType, Seq.empty[Field].asJava) } } @@ -94,9 +107,10 @@ object ArrowUtils { } } - def toArrowSchema(schema: StructType): Schema = { + /** Maps schema from Spark to Arrow. NOTE: timeZoneId required for TimestampType in StructType */ + def toArrowSchema(schema: StructType, timeZoneId: String): Schema = { new Schema(schema.map { field => - toArrowField(field.name, field.dataType, field.nullable) + toArrowField(field.name, field.dataType, field.nullable, timeZoneId) }.asJava) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index 0b740735ffe19..e4af4f65da127 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import org.apache.arrow.vector._ import org.apache.arrow.vector.complex._ -import org.apache.arrow.vector.util.DecimalUtility +import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecializedGetters @@ -29,8 +29,8 @@ import org.apache.spark.sql.types._ object ArrowWriter { - def create(schema: StructType): ArrowWriter = { - val arrowSchema = ArrowUtils.toArrowSchema(schema) + def create(schema: StructType, timeZoneId: String): ArrowWriter = { + val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) val root = VectorSchemaRoot.create(arrowSchema, ArrowUtils.rootAllocator) create(root) } @@ -55,6 +55,8 @@ object ArrowWriter { case (DoubleType, vector: NullableFloat8Vector) => new DoubleWriter(vector) case (StringType, vector: NullableVarCharVector) => new StringWriter(vector) case (BinaryType, vector: NullableVarBinaryVector) => new BinaryWriter(vector) + case (DateType, vector: NullableDateDayVector) => new DateWriter(vector) + case (TimestampType, vector: NullableTimeStampMicroTZVector) => new TimestampWriter(vector) case (ArrayType(_, _), vector: ListVector) => val elementVector = createFieldWriter(vector.getDataVector()) new ArrayWriter(vector, elementVector) @@ -69,9 +71,7 @@ object ArrowWriter { } } -class ArrowWriter( - val root: VectorSchemaRoot, - fields: Array[ArrowFieldWriter]) { +class ArrowWriter(val root: VectorSchemaRoot, fields: Array[ArrowFieldWriter]) { def schema: StructType = StructType(fields.map { f => StructField(f.name, f.dataType, f.nullable) @@ -255,6 +255,33 @@ private[arrow] class BinaryWriter( } } +private[arrow] 
class DateWriter(val valueVector: NullableDateDayVector) extends ArrowFieldWriter { + + override def valueMutator: NullableDateDayVector#Mutator = valueVector.getMutator() + + override def setNull(): Unit = { + valueMutator.setNull(count) + } + + override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { + valueMutator.setSafe(count, input.getInt(ordinal)) + } +} + +private[arrow] class TimestampWriter( + val valueVector: NullableTimeStampMicroTZVector) extends ArrowFieldWriter { + + override def valueMutator: NullableTimeStampMicroTZVector#Mutator = valueVector.getMutator() + + override def setNull(): Unit = { + valueMutator.setNull(count) + } + + override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { + valueMutator.setSafe(count, input.getLong(ordinal)) + } +} + private[arrow] class ArrayWriter( val valueVector: ListVector, val elementWriter: ArrowFieldWriter) extends ArrowFieldWriter { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index 81896187ecc46..0db463a5fbd89 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -79,7 +79,7 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi val columnarBatchIter = new ArrowPythonRunner( funcs, bufferSize, reuseWorker, - PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema) + PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema, conf.sessionLocalTimeZone) .compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index f6c03c415dc66..94c05b9b5e49f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -43,7 +43,8 @@ class ArrowPythonRunner( reuseWorker: Boolean, evalType: Int, argOffsets: Array[Array[Int]], - schema: StructType) + schema: StructType, + timeZoneId: String) extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch]( funcs, bufferSize, reuseWorker, evalType, argOffsets) { @@ -60,7 +61,7 @@ class ArrowPythonRunner( } protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { - val arrowSchema = ArrowUtils.toArrowSchema(schema) + val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) val allocator = ArrowUtils.rootAllocator.newChildAllocator( s"stdout writer for $pythonExec", 0, Long.MaxValue) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala index 5ed88ada428cb..cc93fda9f81da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala @@ -94,8 +94,8 @@ case class FlatMapGroupsInPandasExec( val columnarBatchIter = new ArrowPythonRunner( chainedFunc, bufferSize, reuseWorker, - PythonEvalType.SQL_PANDAS_GROUPED_UDF, argOffsets, schema) - .compute(grouped, context.partitionId(), context) + PythonEvalType.SQL_PANDAS_GROUPED_UDF, 
argOffsets, schema, conf.sessionLocalTimeZone) + .compute(grouped, context.partitionId(), context) columnarBatchIter.flatMap(_.rowIterator.asScala).map(UnsafeProjection.create(output, output)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index 30422b657742c..ba2903babbba8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -32,6 +32,8 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{BinaryType, IntegerType, StructField, StructType} import org.apache.spark.util.Utils @@ -793,6 +795,103 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { collectAndValidate(df, json, "binaryData.json") } + test("date type conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "date", + | "type" : { + | "name" : "date", + | "unit" : "DAY" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 32 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 4, + | "columns" : [ { + | "name" : "date", + | "count" : 4, + | "VALIDITY" : [ 1, 1, 1, 1 ], + | "DATA" : [ -1, 0, 16533, 382607 ] + | } ] + | } ] + |} + """.stripMargin + + val d1 = DateTimeUtils.toJavaDate(-1) // "1969-12-31" + val d2 = DateTimeUtils.toJavaDate(0) // "1970-01-01" + val d3 = Date.valueOf("2015-04-08") + val d4 = Date.valueOf("3017-07-18") + + val df = Seq(d1, d2, d3, d4).toDF("date") + + collectAndValidate(df, json, "dateData.json") + } + + test("timestamp type conversion") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/Los_Angeles") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "timestamp", + | "type" : { + | "name" : "timestamp", + | "unit" : "MICROSECOND", + | "timezone" : "America/Los_Angeles" + | }, + | "nullable" : true, + | "children" : [ ], + | "typeLayout" : { + | "vectors" : [ { + | "type" : "VALIDITY", + | "typeBitWidth" : 1 + | }, { + | "type" : "DATA", + | "typeBitWidth" : 64 + | } ] + | } + | } ] + | }, + | "batches" : [ { + | "count" : 4, + | "columns" : [ { + | "name" : "timestamp", + | "count" : 4, + | "VALIDITY" : [ 1, 1, 1, 1 ], + | "DATA" : [ -1234, 0, 1365383415567000, 33057298500000000 ] + | } ] + | } ] + |} + """.stripMargin + + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z", Locale.US) + val ts1 = DateTimeUtils.toJavaTimestamp(-1234L) + val ts2 = DateTimeUtils.toJavaTimestamp(0L) + val ts3 = new Timestamp(sdf.parse("2013-04-08 01:10:15.567 UTC").getTime) + val ts4 = new Timestamp(sdf.parse("3017-07-18 14:55:00.000 UTC").getTime) + val data = Seq(ts1, ts2, ts3, ts4) + + val df = data.toDF("timestamp") + + collectAndValidate(df, json, "timestampData.json", "America/Los_Angeles") + } + } + test("floating-point NaN") { val json = s""" @@ -1486,15 +1585,6 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { runUnsupported { 
decimalData.toArrowPayload.collect() } runUnsupported { mapData.toDF().toArrowPayload.collect() } runUnsupported { complexData.toArrowPayload.collect() } - - val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z", Locale.US) - val d1 = new Date(sdf.parse("2015-04-08 13:10:15.000 UTC").getTime) - val d2 = new Date(sdf.parse("2016-05-09 13:10:15.000 UTC").getTime) - runUnsupported { Seq(d1, d2).toDF("date").toArrowPayload.collect() } - - val ts1 = new Timestamp(sdf.parse("2013-04-08 01:10:15.567 UTC").getTime) - val ts2 = new Timestamp(sdf.parse("2013-04-08 13:10:10.789 UTC").getTime) - runUnsupported { Seq(ts1, ts2).toDF("timestamp").toArrowPayload.collect() } } test("test Arrow Validator") { @@ -1638,7 +1728,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { val schema = StructType(Seq(StructField("int", IntegerType, nullable = true))) val ctx = TaskContext.empty() - val payloadIter = ArrowConverters.toPayloadIterator(inputRows.toIterator, schema, 0, ctx) + val payloadIter = ArrowConverters.toPayloadIterator(inputRows.toIterator, schema, 0, null, ctx) val outputRowIter = ArrowConverters.fromPayloadIterator(payloadIter, ctx) assert(schema.equals(outputRowIter.schema)) @@ -1657,22 +1747,24 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { } /** Test that a converted DataFrame to Arrow record batch equals batch read from JSON file */ - private def collectAndValidate(df: DataFrame, json: String, file: String): Unit = { + private def collectAndValidate( + df: DataFrame, json: String, file: String, timeZoneId: String = null): Unit = { // NOTE: coalesce to single partition because can only load 1 batch in validator val arrowPayload = df.coalesce(1).toArrowPayload.collect().head val tempFile = new File(tempDataPath, file) Files.write(json, tempFile, StandardCharsets.UTF_8) - validateConversion(df.schema, arrowPayload, tempFile) + validateConversion(df.schema, arrowPayload, tempFile, timeZoneId) } private def validateConversion( sparkSchema: StructType, arrowPayload: ArrowPayload, - jsonFile: File): Unit = { + jsonFile: File, + timeZoneId: String = null): Unit = { val allocator = new RootAllocator(Long.MaxValue) val jsonReader = new JsonFileReader(jsonFile, allocator) - val arrowSchema = ArrowUtils.toArrowSchema(sparkSchema) + val arrowSchema = ArrowUtils.toArrowSchema(sparkSchema, timeZoneId) val jsonSchema = jsonReader.start() Validator.compareSchemas(arrowSchema, jsonSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowUtilsSuite.scala index 638619fd39d06..d801f62b62323 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowUtilsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowUtilsSuite.scala @@ -17,7 +17,10 @@ package org.apache.spark.sql.execution.arrow +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} + import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ class ArrowUtilsSuite extends SparkFunSuite { @@ -25,7 +28,7 @@ class ArrowUtilsSuite extends SparkFunSuite { def roundtrip(dt: DataType): Unit = { dt match { case schema: StructType => - assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema)) === schema) + assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema, null)) === schema) case _ => roundtrip(new 
StructType().add("value", dt)) } @@ -42,6 +45,27 @@ class ArrowUtilsSuite extends SparkFunSuite { roundtrip(StringType) roundtrip(BinaryType) roundtrip(DecimalType.SYSTEM_DEFAULT) + roundtrip(DateType) + val tsExMsg = intercept[UnsupportedOperationException] { + roundtrip(TimestampType) + } + assert(tsExMsg.getMessage.contains("timeZoneId")) + } + + test("timestamp") { + + def roundtripWithTz(timeZoneId: String): Unit = { + val schema = new StructType().add("value", TimestampType) + val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) + val fieldType = arrowSchema.findField("value").getType.asInstanceOf[ArrowType.Timestamp] + assert(fieldType.getTimezone() === timeZoneId) + assert(ArrowUtils.fromArrowSchema(arrowSchema) === schema) + } + + roundtripWithTz(DateTimeUtils.defaultTimeZone().getID) + roundtripWithTz("Asia/Tokyo") + roundtripWithTz("UTC") + roundtripWithTz("America/Los_Angeles") } test("array") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index e9a629315f5f4..a71e30aa3ca96 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -27,9 +27,9 @@ import org.apache.spark.unsafe.types.UTF8String class ArrowWriterSuite extends SparkFunSuite { test("simple") { - def check(dt: DataType, data: Seq[Any]): Unit = { + def check(dt: DataType, data: Seq[Any], timeZoneId: String = null): Unit = { val schema = new StructType().add("value", dt, nullable = true) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, timeZoneId) assert(writer.schema === schema) data.foreach { datum => @@ -51,6 +51,8 @@ class ArrowWriterSuite extends SparkFunSuite { case DoubleType => reader.getDouble(rowId) case StringType => reader.getUTF8String(rowId) case BinaryType => reader.getBinary(rowId) + case DateType => reader.getInt(rowId) + case TimestampType => reader.getLong(rowId) } assert(value === datum) } @@ -66,12 +68,14 @@ class ArrowWriterSuite extends SparkFunSuite { check(DoubleType, Seq(1.0d, 2.0d, null, 4.0d)) check(StringType, Seq("a", "b", null, "d").map(UTF8String.fromString)) check(BinaryType, Seq("a".getBytes(), "b".getBytes(), null, "d".getBytes())) + check(DateType, Seq(0, 1, 2, null, 4)) + check(TimestampType, Seq(0L, 3.6e9.toLong, null, 8.64e10.toLong), "America/Los_Angeles") } test("get multiple") { - def check(dt: DataType, data: Seq[Any]): Unit = { + def check(dt: DataType, data: Seq[Any], timeZoneId: String = null): Unit = { val schema = new StructType().add("value", dt, nullable = false) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, timeZoneId) assert(writer.schema === schema) data.foreach { datum => @@ -88,6 +92,8 @@ class ArrowWriterSuite extends SparkFunSuite { case LongType => reader.getLongs(0, data.size) case FloatType => reader.getFloats(0, data.size) case DoubleType => reader.getDoubles(0, data.size) + case DateType => reader.getInts(0, data.size) + case TimestampType => reader.getLongs(0, data.size) } assert(values === data) @@ -100,12 +106,14 @@ class ArrowWriterSuite extends SparkFunSuite { check(LongType, (0 until 10).map(_.toLong)) check(FloatType, (0 until 10).map(_.toFloat)) check(DoubleType, (0 until 10).map(_.toDouble)) + check(DateType, (0 until 10)) + check(TimestampType, (0 until 10).map(_ * 4.32e10.toLong), 
"America/Los_Angeles") } test("array") { val schema = new StructType() .add("arr", ArrayType(IntegerType, containsNull = true), nullable = true) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, null) assert(writer.schema === schema) writer.write(InternalRow(ArrayData.toArrayData(Array(1, 2, 3)))) @@ -144,7 +152,7 @@ class ArrowWriterSuite extends SparkFunSuite { test("nested array") { val schema = new StructType().add("nested", ArrayType(ArrayType(IntegerType))) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, null) assert(writer.schema === schema) writer.write(InternalRow(ArrayData.toArrayData(Array( @@ -195,7 +203,7 @@ class ArrowWriterSuite extends SparkFunSuite { test("struct") { val schema = new StructType() .add("struct", new StructType().add("i", IntegerType).add("str", StringType)) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, null) assert(writer.schema === schema) writer.write(InternalRow(InternalRow(1, UTF8String.fromString("str1")))) @@ -231,7 +239,7 @@ class ArrowWriterSuite extends SparkFunSuite { test("nested struct") { val schema = new StructType().add("struct", new StructType().add("nested", new StructType().add("i", IntegerType).add("str", StringType))) - val writer = ArrowWriter.create(schema) + val writer = ArrowWriter.create(schema, null) assert(writer.schema === schema) writer.write(InternalRow(InternalRow(InternalRow(1, UTF8String.fromString("str1"))))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index d24a9e1f4bd16..068a17bf772e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -29,7 +29,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("boolean") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("boolean", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("boolean", BooleanType, nullable = true) + val vector = ArrowUtils.toArrowField("boolean", BooleanType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableBitVector] vector.allocateNew() val mutator = vector.getMutator() @@ -58,7 +58,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("byte") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("byte", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("byte", ByteType, nullable = true) + val vector = ArrowUtils.toArrowField("byte", ByteType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableTinyIntVector] vector.allocateNew() val mutator = vector.getMutator() @@ -87,7 +87,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("short") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("short", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("short", ShortType, nullable = true) + val vector = ArrowUtils.toArrowField("short", ShortType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableSmallIntVector] vector.allocateNew() val mutator = vector.getMutator() @@ -116,7 +116,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("int") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("int", IntegerType, nullable = true) + val 
vector = ArrowUtils.toArrowField("int", IntegerType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableIntVector] vector.allocateNew() val mutator = vector.getMutator() @@ -145,7 +145,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("long") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("long", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("long", LongType, nullable = true) + val vector = ArrowUtils.toArrowField("long", LongType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableBigIntVector] vector.allocateNew() val mutator = vector.getMutator() @@ -174,7 +174,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("float") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("float", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("float", FloatType, nullable = true) + val vector = ArrowUtils.toArrowField("float", FloatType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableFloat4Vector] vector.allocateNew() val mutator = vector.getMutator() @@ -203,7 +203,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("double") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("double", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("double", DoubleType, nullable = true) + val vector = ArrowUtils.toArrowField("double", DoubleType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableFloat8Vector] vector.allocateNew() val mutator = vector.getMutator() @@ -232,7 +232,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("string") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("string", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("string", StringType, nullable = true) + val vector = ArrowUtils.toArrowField("string", StringType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableVarCharVector] vector.allocateNew() val mutator = vector.getMutator() @@ -260,7 +260,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("binary") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("binary", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("binary", BinaryType, nullable = true) + val vector = ArrowUtils.toArrowField("binary", BinaryType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableVarBinaryVector] vector.allocateNew() val mutator = vector.getMutator() @@ -288,7 +288,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("array") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("array", 0, Long.MaxValue) - val vector = ArrowUtils.toArrowField("array", ArrayType(IntegerType), nullable = true) + val vector = ArrowUtils.toArrowField("array", ArrayType(IntegerType), nullable = true, null) .createVector(allocator).asInstanceOf[ListVector] vector.allocateNew() val mutator = vector.getMutator() @@ -345,7 +345,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("struct") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) val schema = new StructType().add("int", IntegerType).add("long", LongType) - val vector = ArrowUtils.toArrowField("struct", schema, nullable = true) + val vector = ArrowUtils.toArrowField("struct", schema, nullable = true, null) .createVector(allocator).asInstanceOf[NullableMapVector] vector.allocateNew() val mutator = vector.getMutator() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 0b179aa97c479..4cfc776e51db1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -1249,11 +1249,11 @@ class ColumnarBatchSuite extends SparkFunSuite { test("create columnar batch from Arrow column vectors") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue) - val vector1 = ArrowUtils.toArrowField("int1", IntegerType, nullable = true) + val vector1 = ArrowUtils.toArrowField("int1", IntegerType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableIntVector] vector1.allocateNew() val mutator1 = vector1.getMutator() - val vector2 = ArrowUtils.toArrowField("int2", IntegerType, nullable = true) + val vector2 = ArrowUtils.toArrowField("int2", IntegerType, nullable = true, null) .createVector(allocator).asInstanceOf[NullableIntVector] vector2.allocateNew() val mutator2 = vector2.getMutator() From 36b826f5d17ae7be89135cb2c43ff797f9e7fe48 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 27 Oct 2017 07:52:10 -0700 Subject: [PATCH 776/779] [TRIVIAL][SQL] Code cleaning in ResolveReferences ## What changes were proposed in this pull request? This PR is to clean the related codes majorly based on the today's code review on https://github.com/apache/spark/pull/19559 ## How was this patch tested? N/A Author: gatorsmile Closes #19585 from gatorsmile/trivialFixes. --- .../sql/catalyst/analysis/Analyzer.scala | 21 +++++++++++-------- .../scala/org/apache/spark/sql/Column.scala | 10 ++++----- .../spark/sql/RelationalGroupedDataset.scala | 4 ++-- .../sql/execution/WholeStageCodegenExec.scala | 5 ++--- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index d6a962a14dc9c..6384a141e83b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -783,6 +783,17 @@ class Analyzer( } } + private def resolve(e: Expression, q: LogicalPlan): Expression = e match { + case u @ UnresolvedAttribute(nameParts) => + // Leave unchanged if resolution fails. Hopefully will be resolved next round. + val result = withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } + logDebug(s"Resolving $u to $result") + result + case UnresolvedExtractValue(child, fieldExpr) if child.resolved => + ExtractValue(child, fieldExpr, resolver) + case _ => e.mapChildren(resolve(_, q)) + } + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { case p: LogicalPlan if !p.childrenResolved => p @@ -841,15 +852,7 @@ class Analyzer( case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString}") - q.transformExpressionsUp { - case u @ UnresolvedAttribute(nameParts) => - // Leave unchanged if resolution fails. Hopefully will be resolved next round. 
- val result = withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } - logDebug(s"Resolving $u to $result") - result - case UnresolvedExtractValue(child, fieldExpr) if child.resolved => - ExtractValue(child, fieldExpr, resolver) - } + q.mapExpressions(resolve(_, q)) } def newAliases(expressions: Seq[NamedExpression]): Seq[NamedExpression] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 8468a8a96349a..92988680871a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.util.usePrettyExpression +import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.lit @@ -44,7 +44,7 @@ private[sql] object Column { e match { case a: AggregateExpression if a.aggregateFunction.isInstanceOf[TypedAggregateExpression] => a.aggregateFunction.toString - case expr => usePrettyExpression(expr).sql + case expr => toPrettySQL(expr) } } } @@ -137,7 +137,7 @@ class Column(val expr: Expression) extends Logging { case _ => UnresolvedAttribute.quotedString(name) }) - override def toString: String = usePrettyExpression(expr).sql + override def toString: String = toPrettySQL(expr) override def equals(that: Any): Boolean = that match { case that: Column => that.expr.equals(this.expr) @@ -175,7 +175,7 @@ class Column(val expr: Expression) extends Logging { case c @ Cast(_: NamedExpression, _, _) => UnresolvedAlias(c) } match { case ne: NamedExpression => ne - case other => Alias(expr, usePrettyExpression(expr).sql)() + case _ => Alias(expr, toPrettySQL(expr))() } case a: AggregateExpression if a.aggregateFunction.isInstanceOf[TypedAggregateExpression] => @@ -184,7 +184,7 @@ class Column(val expr: Expression) extends Logging { // Wait until the struct is resolved. This will generate a nicer looking alias. 
case struct: CreateNamedStructLike => UnresolvedAlias(struct) - case expr: Expression => Alias(expr, usePrettyExpression(expr).sql)() + case expr: Expression => Alias(expr, toPrettySQL(expr))() } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 6b45790d5ff6e..21e94fa8bb0b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedAlias, Unresolved import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.usePrettyExpression +import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.execution.python.{PythonUDF, PythonUdfType} import org.apache.spark.sql.internal.SQLConf @@ -85,7 +85,7 @@ class RelationalGroupedDataset protected[sql]( case expr: NamedExpression => expr case a: AggregateExpression if a.aggregateFunction.isInstanceOf[TypedAggregateExpression] => UnresolvedAlias(a, Some(Column.generateAlias)) - case expr: Expression => Alias(expr, usePrettyExpression(expr).sql)() + case expr: Expression => Alias(expr, toPrettySQL(expr))() } private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => AggregateFunction) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index e37d133ff336a..286cb3bb0767c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -521,10 +521,9 @@ case class CollapseCodegenStages(conf: SQLConf) extends Rule[SparkPlan] { case p if !supportCodegen(p) => // collapse them recursively InputAdapter(insertWholeStageCodegen(p)) - case j @ SortMergeJoinExec(_, _, _, _, left, right) => + case j: SortMergeJoinExec => // The children of SortMergeJoin should do codegen separately. - j.copy(left = InputAdapter(insertWholeStageCodegen(left)), - right = InputAdapter(insertWholeStageCodegen(right))) + j.withNewChildren(j.children.map(child => InputAdapter(insertWholeStageCodegen(child)))) case p => p.withNewChildren(p.children.map(insertInputAdapter)) } From b3d8fc3dc458d42cf11d961762ce99f551f68548 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 27 Oct 2017 13:43:09 -0700 Subject: [PATCH 777/779] [SPARK-22226][SQL] splitExpression can create too many method calls in the outer class ## What changes were proposed in this pull request? SPARK-18016 introduced `NestedClass` to avoid that the many methods generated by `splitExpressions` contribute to the outer class' constant pool, making it growing too much. Unfortunately, despite their definition is stored in the `NestedClass`, they all are invoked in the outer class and for each method invocation, there are two entries added to the constant pool: a `Methodref` and a `Utf8` entry (you can easily check this compiling a simple sample class with `janinoc` and looking at its Constant Pool). This limits the scalability of the solution with very large methods which are split in a lot of small ones. 
This means that currently we are generating classes like this one: ``` class SpecificUnsafeProjection extends org.apache.spark.sql.catalyst.expressions.UnsafeProjection { ... public UnsafeRow apply(InternalRow i) { rowWriter.zeroOutNullBytes(); apply_0(i); apply_1(i); ... nestedClassInstance.apply_862(i); nestedClassInstance.apply_863(i); ... nestedClassInstance1.apply_1612(i); nestedClassInstance1.apply_1613(i); ... } ... private class NestedClass { private void apply_862(InternalRow i) { ... } private void apply_863(InternalRow i) { ... } ... } private class NestedClass1 { private void apply_1612(InternalRow i) { ... } private void apply_1613(InternalRow i) { ... } ... } } ``` This PR reduce the Constant Pool size of the outer class by adding a new method to each nested class: in this method we invoke all the small methods generated by `splitExpression` in that nested class. In this way, in the outer class there is only one method invocation per nested class, reducing by orders of magnitude the entries in its constant pool because of method invocations. This means that after the patch the generated code becomes: ``` class SpecificUnsafeProjection extends org.apache.spark.sql.catalyst.expressions.UnsafeProjection { ... public UnsafeRow apply(InternalRow i) { rowWriter.zeroOutNullBytes(); apply_0(i); apply_1(i); ... nestedClassInstance.apply(i); nestedClassInstance1.apply(i); ... } ... private class NestedClass { private void apply_862(InternalRow i) { ... } private void apply_863(InternalRow i) { ... } ... private void apply(InternalRow i) { apply_862(i); apply_863(i); ... } } private class NestedClass1 { private void apply_1612(InternalRow i) { ... } private void apply_1613(InternalRow i) { ... } ... private void apply(InternalRow i) { apply_1612(i); apply_1613(i); ... } } } ``` ## How was this patch tested? Added UT and existing UTs Author: Marco Gaido Author: Marco Gaido Closes #19480 from mgaido91/SPARK-22226. --- .../expressions/codegen/CodeGenerator.scala | 156 ++++++++++++++++-- .../expressions/CodeGenerationSuite.scala | 17 ++ .../org/apache/spark/sql/DataFrameSuite.scala | 12 ++ 3 files changed, 167 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 2cb66599076a9..58738b52b299f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -77,6 +77,22 @@ case class SubExprEliminationState(isNull: String, value: String) */ case class SubExprCodes(codes: Seq[String], states: Map[Expression, SubExprEliminationState]) +/** + * The main information about a new added function. + * + * @param functionName String representing the name of the function + * @param innerClassName Optional value which is empty if the function is added to + * the outer class, otherwise it contains the name of the + * inner class in which the function has been added. + * @param innerClassInstance Optional value which is empty if the function is added to + * the outer class, otherwise it contains the name of the + * instance of the inner class in the outer class. 
+ */ +private[codegen] case class NewFunctionSpec( + functionName: String, + innerClassName: Option[String], + innerClassInstance: Option[String]) + /** * A context for codegen, tracking a list of objects that could be passed into generated Java * function. @@ -228,8 +244,8 @@ class CodegenContext { /** * Holds the class and instance names to be generated, where `OuterClass` is a placeholder * standing for whichever class is generated as the outermost class and which will contain any - * nested sub-classes. All other classes and instance names in this list will represent private, - * nested sub-classes. + * inner sub-classes. All other classes and instance names in this list will represent private, + * inner sub-classes. */ private val classes: mutable.ListBuffer[(String, String)] = mutable.ListBuffer[(String, String)](outerClassName -> null) @@ -260,8 +276,8 @@ class CodegenContext { /** * Adds a function to the generated class. If the code for the `OuterClass` grows too large, the - * function will be inlined into a new private, nested class, and a class-qualified name for the - * function will be returned. Otherwise, the function will be inined to the `OuterClass` the + * function will be inlined into a new private, inner class, and a class-qualified name for the + * function will be returned. Otherwise, the function will be inlined to the `OuterClass` the * simple `funcName` will be returned. * * @param funcName the class-unqualified name of the function @@ -271,19 +287,27 @@ class CodegenContext { * it is eventually referenced and a returned qualified function name * cannot otherwise be accessed. * @return the name of the function, qualified by class if it will be inlined to a private, - * nested sub-class + * inner class */ def addNewFunction( funcName: String, funcCode: String, inlineToOuterClass: Boolean = false): String = { - // The number of named constants that can exist in the class is limited by the Constant Pool - // limit, 65,536. We cannot know how many constants will be inserted for a class, so we use a - // threshold of 1600k bytes to determine when a function should be inlined to a private, nested - // sub-class. + val newFunction = addNewFunctionInternal(funcName, funcCode, inlineToOuterClass) + newFunction match { + case NewFunctionSpec(functionName, None, None) => functionName + case NewFunctionSpec(functionName, Some(_), Some(innerClassInstance)) => + innerClassInstance + "." + functionName + } + } + + private[this] def addNewFunctionInternal( + funcName: String, + funcCode: String, + inlineToOuterClass: Boolean): NewFunctionSpec = { val (className, classInstance) = if (inlineToOuterClass) { outerClassName -> "" - } else if (currClassSize > 1600000) { + } else if (currClassSize > CodeGenerator.GENERATED_CLASS_SIZE_THRESHOLD) { val className = freshName("NestedClass") val classInstance = freshName("nestedClassInstance") @@ -294,17 +318,23 @@ class CodegenContext { currClass() } - classSize(className) += funcCode.length - classFunctions(className) += funcName -> funcCode + addNewFunctionToClass(funcName, funcCode, className) if (className == outerClassName) { - funcName + NewFunctionSpec(funcName, None, None) } else { - - s"$classInstance.$funcName" + NewFunctionSpec(funcName, Some(className), Some(classInstance)) } } + private[this] def addNewFunctionToClass( + funcName: String, + funcCode: String, + className: String) = { + classSize(className) += funcCode.length + classFunctions(className) += funcName -> funcCode + } + /** * Declares all function code. 
If the added functions are too many, split them into nested * sub-classes to avoid hitting Java compiler constant pool limitation. @@ -738,7 +768,7 @@ class CodegenContext { /** * Splits the generated code of expressions into multiple functions, because function has * 64kb code size limit in JVM. If the class to which the function would be inlined would grow - * beyond 1600kb, we declare a private, nested sub-class, and the function is inlined to it + * beyond 1000kb, we declare a private, inner sub-class, and the function is inlined to it * instead, because classes have a constant pool limit of 65,536 named values. * * @param row the variable name of row that is used by expressions @@ -801,10 +831,90 @@ class CodegenContext { | ${makeSplitFunction(body)} |} """.stripMargin - addNewFunction(name, code) + addNewFunctionInternal(name, code, inlineToOuterClass = false) } - foldFunctions(functions.map(name => s"$name(${arguments.map(_._2).mkString(", ")})")) + val (outerClassFunctions, innerClassFunctions) = functions.partition(_.innerClassName.isEmpty) + + val argsString = arguments.map(_._2).mkString(", ") + val outerClassFunctionCalls = outerClassFunctions.map(f => s"${f.functionName}($argsString)") + + val innerClassFunctionCalls = generateInnerClassesFunctionCalls( + innerClassFunctions, + func, + arguments, + returnType, + makeSplitFunction, + foldFunctions) + + foldFunctions(outerClassFunctionCalls ++ innerClassFunctionCalls) + } + } + + /** + * Here we handle all the methods which have been added to the inner classes and + * not to the outer class. + * Since they can be many, their direct invocation in the outer class adds many entries + * to the outer class' constant pool. This can cause the constant pool to past JVM limit. + * Moreover, this can cause also the outer class method where all the invocations are + * performed to grow beyond the 64k limit. + * To avoid these problems, we group them and we call only the grouping methods in the + * outer class. + * + * @param functions a [[Seq]] of [[NewFunctionSpec]] defined in the inner classes + * @param funcName the split function name base. + * @param arguments the list of (type, name) of the arguments of the split function. + * @param returnType the return type of the split function. + * @param makeSplitFunction makes split function body, e.g. add preparation or cleanup. + * @param foldFunctions folds the split function calls. 
+ * @return an [[Iterable]] containing the methods' invocations + */ + private def generateInnerClassesFunctionCalls( + functions: Seq[NewFunctionSpec], + funcName: String, + arguments: Seq[(String, String)], + returnType: String, + makeSplitFunction: String => String, + foldFunctions: Seq[String] => String): Iterable[String] = { + val innerClassToFunctions = mutable.LinkedHashMap.empty[(String, String), Seq[String]] + functions.foreach(f => { + val key = (f.innerClassName.get, f.innerClassInstance.get) + val value = f.functionName +: innerClassToFunctions.getOrElse(key, Seq.empty[String]) + innerClassToFunctions.put(key, value) + }) + + val argDefinitionString = arguments.map { case (t, name) => s"$t $name" }.mkString(", ") + val argInvocationString = arguments.map(_._2).mkString(", ") + + innerClassToFunctions.flatMap { + case ((innerClassName, innerClassInstance), innerClassFunctions) => + // for performance reasons, the functions are prepended, instead of appended, + // thus here they are in reversed order + val orderedFunctions = innerClassFunctions.reverse + if (orderedFunctions.size > CodeGenerator.MERGE_SPLIT_METHODS_THRESHOLD) { + // Adding a new function to each inner class which contains the invocation of all the + // ones which have been added to that inner class. For example, + // private class NestedClass { + // private void apply_862(InternalRow i) { ... } + // private void apply_863(InternalRow i) { ... } + // ... + // private void apply(InternalRow i) { + // apply_862(i); + // apply_863(i); + // ... + // } + // } + val body = foldFunctions(orderedFunctions.map(name => s"$name($argInvocationString)")) + val code = s""" + |private $returnType $funcName($argDefinitionString) { + | ${makeSplitFunction(body)} + |} + """.stripMargin + addNewFunctionToClass(funcName, code, innerClassName) + Seq(s"$innerClassInstance.$funcName($argInvocationString)") + } else { + orderedFunctions.map(f => s"$innerClassInstance.$f($argInvocationString)") + } } } @@ -1013,6 +1123,16 @@ object CodeGenerator extends Logging { // This is the value of HugeMethodLimit in the OpenJDK JVM settings val DEFAULT_JVM_HUGE_METHOD_LIMIT = 8000 + // This is the threshold over which the methods in an inner class are grouped in a single + // method which is going to be called by the outer class instead of the many small ones + val MERGE_SPLIT_METHODS_THRESHOLD = 3 + + // The number of named constants that can exist in the class is limited by the Constant Pool + // limit, 65,536. We cannot know how many constants will be inserted for a class, so we use a + // threshold of 1000k bytes to determine when a function should be inlined to a private, inner + // class. + val GENERATED_CLASS_SIZE_THRESHOLD = 1000000 + /** * Compile the Java source code into a Java class, using Janino. 
* diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 7ea0bec145481..1e6f7b65e7e72 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -201,6 +201,23 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-22226: group splitted expressions into one method per nested class") { + val length = 10000 + val expressions = Seq.fill(length) { + ToUTCTimestamp( + Literal.create(Timestamp.valueOf("2017-10-10 00:00:00"), TimestampType), + Literal.create("PST", StringType)) + } + val plan = GenerateMutableProjection.generate(expressions) + val actual = plan(new GenericInternalRow(length)).toSeq(expressions.map(_.dataType)) + val expected = Seq.fill(length)( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2017-10-10 07:00:00"))) + + if (actual != expected) { + fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected") + } + } + test("test generated safe and unsafe projection") { val schema = new StructType(Array( StructField("a", StringType, true), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 473c355cf3c7f..17c88b0690800 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2106,6 +2106,18 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2))) } + test("SPARK-22226: splitExpressions should not generate codes beyond 64KB") { + val colNumber = 10000 + val input = spark.range(2).rdd.map(_ => Row(1 to colNumber: _*)) + val df = sqlContext.createDataFrame(input, StructType( + (1 to colNumber).map(colIndex => StructField(s"_$colIndex", IntegerType, false)))) + val newCols = (1 to colNumber).flatMap { colIndex => + Seq(expr(s"if(1000 < _$colIndex, 1000, _$colIndex)"), + expr(s"sqrt(_$colIndex)")) + } + df.select(newCols: _*).collect() + } + test("SPARK-22271: mean overflows and returns null for some decimal variables") { val d = 0.034567890 val df = Seq(d, d, d, d, d, d, d, d, d, d).toDF("DecimalCol") From 20eb95e5e9c562261b44e4e47cad67a31390fa59 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 27 Oct 2017 15:19:27 -0700 Subject: [PATCH 778/779] [SPARK-21911][ML][PYSPARK] Parallel Model Evaluation for ML Tuning in PySpark ## What changes were proposed in this pull request? Add parallelism support for ML tuning in pyspark. ## How was this patch tested? Test updated. Author: WeichenXu Closes #19122 from WeichenXu123/par-ml-tuning-py. 
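For reference, a minimal usage sketch of the `parallelism` parameter this patch adds to the Python `CrossValidator` (mirroring the doctest changed below); the toy DataFrame and the pre-existing `spark` session are assumptions for illustration, not part of the patch:

```
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# assumes an existing SparkSession named `spark`
dataset = spark.createDataFrame(
    [(Vectors.dense([0.0]), 0.0),
     (Vectors.dense([0.5]), 0.0),
     (Vectors.dense([0.6]), 1.0),
     (Vectors.dense([1.0]), 1.0)] * 10,
    ["features", "label"])

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()

# parallelism=2 evaluates up to two candidate models concurrently within each fold
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                    numFolds=3, parallelism=2)
cvModel = cv.fit(dataset)
print(cvModel.avgMetrics)
```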
--- .../spark/ml/tuning/CrossValidatorSuite.scala | 4 +- .../ml/tuning/TrainValidationSplitSuite.scala | 4 +- python/pyspark/ml/tests.py | 39 +++++++++ python/pyspark/ml/tuning.py | 86 ++++++++++++------- 4 files changed, 96 insertions(+), 37 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index a01744f7b67fd..853eeb39bf8df 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -137,8 +137,8 @@ class CrossValidatorSuite cv.setParallelism(2) val cvParallelModel = cv.fit(dataset) - val serialMetrics = cvSerialModel.avgMetrics.sorted - val parallelMetrics = cvParallelModel.avgMetrics.sorted + val serialMetrics = cvSerialModel.avgMetrics + val parallelMetrics = cvParallelModel.avgMetrics assert(serialMetrics === parallelMetrics) val parentSerial = cvSerialModel.bestModel.parent.asInstanceOf[LogisticRegression] diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index 2ed4fbb601b61..f8d9c66be2c40 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -138,8 +138,8 @@ class TrainValidationSplitSuite cv.setParallelism(2) val cvParallelModel = cv.fit(dataset) - val serialMetrics = cvSerialModel.validationMetrics.sorted - val parallelMetrics = cvParallelModel.validationMetrics.sorted + val serialMetrics = cvSerialModel.validationMetrics + val parallelMetrics = cvParallelModel.validationMetrics assert(serialMetrics === parallelMetrics) val parentSerial = cvSerialModel.bestModel.parent.asInstanceOf[LogisticRegression] diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 8b8bcc7b13a38..2f1f3af957e4d 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -836,6 +836,27 @@ def test_save_load_simple_estimator(self): loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) + def test_parallel_evaluation(self): + dataset = self.spark.createDataFrame( + [(Vectors.dense([0.0]), 0.0), + (Vectors.dense([0.4]), 1.0), + (Vectors.dense([0.5]), 0.0), + (Vectors.dense([0.6]), 1.0), + (Vectors.dense([1.0]), 1.0)] * 10, + ["features", "label"]) + + lr = LogisticRegression() + grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build() + evaluator = BinaryClassificationEvaluator() + + # test save/load of CrossValidator + cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) + cv.setParallelism(1) + cvSerialModel = cv.fit(dataset) + cv.setParallelism(2) + cvParallelModel = cv.fit(dataset) + self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics) + def test_save_load_nested_estimator(self): temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( @@ -986,6 +1007,24 @@ def test_save_load_simple_estimator(self): loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) + def test_parallel_evaluation(self): + dataset = self.spark.createDataFrame( + [(Vectors.dense([0.0]), 0.0), + (Vectors.dense([0.4]), 1.0), + (Vectors.dense([0.5]), 0.0), + (Vectors.dense([0.6]), 1.0), + (Vectors.dense([1.0]), 1.0)] * 10, + 
["features", "label"]) + lr = LogisticRegression() + grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build() + evaluator = BinaryClassificationEvaluator() + tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) + tvs.setParallelism(1) + tvsSerialModel = tvs.fit(dataset) + tvs.setParallelism(2) + tvsParallelModel = tvs.fit(dataset) + self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics) + def test_save_load_nested_estimator(self): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 00c348aa9f7de..47351133524e7 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -14,15 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import itertools import numpy as np +from multiprocessing.pool import ThreadPool from pyspark import since, keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.common import _py2java from pyspark.ml.param import Params, Param, TypeConverters -from pyspark.ml.param.shared import HasSeed +from pyspark.ml.param.shared import HasParallelism, HasSeed from pyspark.ml.util import * from pyspark.ml.wrapper import JavaParams from pyspark.sql.functions import rand @@ -170,7 +170,7 @@ def _to_java_impl(self): return java_estimator, java_epms, java_evaluator -class CrossValidator(Estimator, ValidatorParams, MLReadable, MLWritable): +class CrossValidator(Estimator, ValidatorParams, HasParallelism, MLReadable, MLWritable): """ K-fold cross validation performs model selection by splitting the dataset into a set of @@ -193,7 +193,8 @@ class CrossValidator(Estimator, ValidatorParams, MLReadable, MLWritable): >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() - >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) + >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, + ... parallelism=2) >>> cvModel = cv.fit(dataset) >>> cvModel.avgMetrics[0] 0.5 @@ -208,23 +209,23 @@ class CrossValidator(Estimator, ValidatorParams, MLReadable, MLWritable): @keyword_only def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, - seed=None): + seed=None, parallelism=1): """ __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ - seed=None) + seed=None, parallelism=1) """ super(CrossValidator, self).__init__() - self._setDefault(numFolds=3) + self._setDefault(numFolds=3, parallelism=1) kwargs = self._input_kwargs self._set(**kwargs) @keyword_only @since("1.4.0") def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, - seed=None): + seed=None, parallelism=1): """ setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ - seed=None): + seed=None, parallelism=1): Sets params for cross validator. 
""" kwargs = self._input_kwargs @@ -255,18 +256,27 @@ def _fit(self, dataset): randCol = self.uid + "_rand" df = dataset.select("*", rand(seed).alias(randCol)) metrics = [0.0] * numModels + + pool = ThreadPool(processes=min(self.getParallelism(), numModels)) + for i in range(nFolds): validateLB = i * h validateUB = (i + 1) * h condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB) - validation = df.filter(condition) - train = df.filter(~condition) - models = est.fit(train, epm) - for j in range(numModels): - model = models[j] + validation = df.filter(condition).cache() + train = df.filter(~condition).cache() + + def singleTrain(paramMap): + model = est.fit(train, paramMap) # TODO: duplicate evaluator to take extra params from input - metric = eva.evaluate(model.transform(validation, epm[j])) - metrics[j] += metric/nFolds + metric = eva.evaluate(model.transform(validation, paramMap)) + return metric + + currentFoldMetrics = pool.map(singleTrain, epm) + for j in range(numModels): + metrics[j] += (currentFoldMetrics[j] / nFolds) + validation.unpersist() + train.unpersist() if eva.isLargerBetter(): bestIndex = np.argmax(metrics) @@ -316,9 +326,10 @@ def _from_java(cls, java_stage): estimator, epms, evaluator = super(CrossValidator, cls)._from_java_impl(java_stage) numFolds = java_stage.getNumFolds() seed = java_stage.getSeed() + parallelism = java_stage.getParallelism() # Create a new instance of this stage. py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator, - numFolds=numFolds, seed=seed) + numFolds=numFolds, seed=seed, parallelism=parallelism) py_stage._resetUid(java_stage.uid()) return py_stage @@ -337,6 +348,7 @@ def _to_java(self): _java_obj.setEstimator(estimator) _java_obj.setSeed(self.getSeed()) _java_obj.setNumFolds(self.getNumFolds()) + _java_obj.setParallelism(self.getParallelism()) return _java_obj @@ -427,7 +439,7 @@ def _to_java(self): return _java_obj -class TrainValidationSplit(Estimator, ValidatorParams, MLReadable, MLWritable): +class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, MLReadable, MLWritable): """ .. note:: Experimental @@ -448,7 +460,8 @@ class TrainValidationSplit(Estimator, ValidatorParams, MLReadable, MLWritable): >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() - >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) + >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, + ... parallelism=2) >>> tvsModel = tvs.fit(dataset) >>> evaluator.evaluate(tvsModel.transform(dataset)) 0.8333... 
@@ -461,23 +474,23 @@ class TrainValidationSplit(Estimator, ValidatorParams, MLReadable, MLWritable): @keyword_only def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, - seed=None): + parallelism=1, seed=None): """ __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\ - seed=None) + parallelism=1, seed=None) """ super(TrainValidationSplit, self).__init__() - self._setDefault(trainRatio=0.75) + self._setDefault(trainRatio=0.75, parallelism=1) kwargs = self._input_kwargs self._set(**kwargs) @since("2.0.0") @keyword_only def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, - seed=None): + parallelism=1, seed=None): """ setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\ - seed=None): + parallelism=1, seed=None): Sets params for the train validation split. """ kwargs = self._input_kwargs @@ -506,15 +519,20 @@ def _fit(self, dataset): seed = self.getOrDefault(self.seed) randCol = self.uid + "_rand" df = dataset.select("*", rand(seed).alias(randCol)) - metrics = [0.0] * numModels condition = (df[randCol] >= tRatio) - validation = df.filter(condition) - train = df.filter(~condition) - models = est.fit(train, epm) - for j in range(numModels): - model = models[j] - metric = eva.evaluate(model.transform(validation, epm[j])) - metrics[j] += metric + validation = df.filter(condition).cache() + train = df.filter(~condition).cache() + + def singleTrain(paramMap): + model = est.fit(train, paramMap) + metric = eva.evaluate(model.transform(validation, paramMap)) + return metric + + pool = ThreadPool(processes=min(self.getParallelism(), numModels)) + metrics = pool.map(singleTrain, epm) + train.unpersist() + validation.unpersist() + if eva.isLargerBetter(): bestIndex = np.argmax(metrics) else: @@ -563,9 +581,10 @@ def _from_java(cls, java_stage): estimator, epms, evaluator = super(TrainValidationSplit, cls)._from_java_impl(java_stage) trainRatio = java_stage.getTrainRatio() seed = java_stage.getSeed() + parallelism = java_stage.getParallelism() # Create a new instance of this stage. py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator, - trainRatio=trainRatio, seed=seed) + trainRatio=trainRatio, seed=seed, parallelism=parallelism) py_stage._resetUid(java_stage.uid()) return py_stage @@ -584,6 +603,7 @@ def _to_java(self): _java_obj.setEstimator(estimator) _java_obj.setTrainRatio(self.getTrainRatio()) _java_obj.setSeed(self.getSeed()) + _java_obj.setParallelism(self.getParallelism()) return _java_obj From 01f6ba0e7a12ef818d56e7d5b1bd889b79f2b57c Mon Sep 17 00:00:00 2001 From: Sathiya Date: Fri, 27 Oct 2017 18:57:08 -0700 Subject: [PATCH 779/779] [SPARK-22181][SQL] Adds ReplaceExceptWithFilter rule ## What changes were proposed in this pull request? Adds a new optimisation rule 'ReplaceExceptWithNotFilter' that replaces Except logical with Filter operator and schedule it before applying 'ReplaceExceptWithAntiJoin' rule. This way we can avoid expensive join operation if one or both of the datasets of the Except operation are fully derived out of Filters from a same parent. ## How was this patch tested? The patch is tested locally using spark-shell + unit test. Author: Sathiya Closes #19451 from sathiyapk/SPARK-22181-optimize-exceptWithFilter. 
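As an illustration only (PySpark, assuming a build that includes this rule), the rewrite and the new conf can be observed with `explain()`. The view name, column names, and app name below are made up for the example; the conf key `spark.sql.optimizer.replaceExceptWithFilter` is the one added by this patch.

```python
# Sketch: both sides of the EXCEPT are filters over the same view, so the
# optimizer should rewrite the Except into Distinct + Filter (with the
# right-hand condition flipped) instead of a left-anti join.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("except-with-filter-sketch").getOrCreate()

spark.range(0, 100).selectExpr("id AS a1", "id % 10 AS a2") \
    .createOrReplaceTempView("tab1")

query = """
SELECT a1, a2 FROM tab1 WHERE a2 = 2
EXCEPT
SELECT a1, a2 FROM tab1 WHERE a1 = 5
"""

# With the rule enabled (the default), the optimized plan should show a
# null-aware, negated filter rather than a join.
spark.sql(query).explain(True)

# Switching the new conf off falls back to the ReplaceExceptWithAntiJoin plan.
spark.conf.set("spark.sql.optimizer.replaceExceptWithFilter", "false")
spark.sql(query).explain(True)
```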
--- .../sql/catalyst/expressions/subquery.scala | 10 ++ .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../optimizer/ReplaceExceptWithFilter.scala | 101 +++++++++++++++++ .../apache/spark/sql/internal/SQLConf.scala | 15 +++ .../optimizer/ReplaceOperatorSuite.scala | 106 +++++++++++++++++- .../resources/sql-tests/inputs/except.sql | 57 ++++++++++ .../sql-tests/results/except.sql.out | 105 +++++++++++++++++ 7 files changed, 394 insertions(+), 1 deletion(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala create mode 100644 sql/core/src/test/resources/sql-tests/inputs/except.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/except.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index c6146042ef1a6..6acc87a3e7367 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -89,6 +89,16 @@ object SubqueryExpression { case _ => false }.isDefined } + + /** + * Returns true when an expression contains a subquery + */ + def hasSubquery(e: Expression): Boolean = { + e.find { + case _: SubqueryExpression => true + case _ => false + }.isDefined + } } object SubExprUtils extends PredicateHelper { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index d829e01441dcc..3273a61dc7b35 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -76,6 +76,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) OptimizeSubqueries) :: Batch("Replace Operators", fixedPoint, ReplaceIntersectWithSemiJoin, + ReplaceExceptWithFilter, ReplaceExceptWithAntiJoin, ReplaceDistinctWithAggregate) :: Batch("Aggregate", fixedPoint, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala new file mode 100644 index 0000000000000..89bfcee078fba --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import scala.annotation.tailrec + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule + + +/** + * If one or both of the datasets in the logical [[Except]] operator are purely transformed using + * [[Filter]], this rule will replace logical [[Except]] operator with a [[Filter]] operator by + * flipping the filter condition of the right child. + * {{{ + * SELECT a1, a2 FROM Tab1 WHERE a2 = 12 EXCEPT SELECT a1, a2 FROM Tab1 WHERE a1 = 5 + * ==> SELECT DISTINCT a1, a2 FROM Tab1 WHERE a2 = 12 AND (a1 is null OR a1 <> 5) + * }}} + * + * Note: + * Before flipping the filter condition of the right node, we should: + * 1. Combine all it's [[Filter]]. + * 2. Apply InferFiltersFromConstraints rule (to take into account of NULL values in the condition). + */ +object ReplaceExceptWithFilter extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = { + if (!plan.conf.replaceExceptWithFilter) { + return plan + } + + plan.transform { + case Except(left, right) if isEligible(left, right) => + Distinct(Filter(Not(transformCondition(left, skipProject(right))), left)) + } + } + + private def transformCondition(left: LogicalPlan, right: LogicalPlan): Expression = { + val filterCondition = + InferFiltersFromConstraints(combineFilters(right)).asInstanceOf[Filter].condition + + val attributeNameMap: Map[String, Attribute] = left.output.map(x => (x.name, x)).toMap + + filterCondition.transform { case a : AttributeReference => attributeNameMap(a.name) } + } + + // TODO: This can be further extended in the future. + private def isEligible(left: LogicalPlan, right: LogicalPlan): Boolean = (left, right) match { + case (_, right @ (Project(_, _: Filter) | Filter(_, _))) => verifyConditions(left, right) + case _ => false + } + + private def verifyConditions(left: LogicalPlan, right: LogicalPlan): Boolean = { + val leftProjectList = projectList(left) + val rightProjectList = projectList(right) + + left.output.size == left.output.map(_.name).distinct.size && + left.find(_.expressions.exists(SubqueryExpression.hasSubquery)).isEmpty && + right.find(_.expressions.exists(SubqueryExpression.hasSubquery)).isEmpty && + Project(leftProjectList, nonFilterChild(skipProject(left))).sameResult( + Project(rightProjectList, nonFilterChild(skipProject(right)))) + } + + private def projectList(node: LogicalPlan): Seq[NamedExpression] = node match { + case p: Project => p.projectList + case x => x.output + } + + private def skipProject(node: LogicalPlan): LogicalPlan = node match { + case p: Project => p.child + case x => x + } + + private def nonFilterChild(plan: LogicalPlan) = plan.find(!_.isInstanceOf[Filter]).getOrElse { + throw new IllegalStateException("Leaf node is expected") + } + + private def combineFilters(plan: LogicalPlan): LogicalPlan = { + @tailrec + def iterate(plan: LogicalPlan, acc: LogicalPlan): LogicalPlan = { + if (acc.fastEquals(plan)) acc else iterate(acc, CombineFilters(acc)) + } + iterate(plan, CombineFilters(plan)) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 21e4685fcc456..5203e8833fbbb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -948,6 +948,19 @@ object SQLConf { .intConf 
.createWithDefault(10000) + val REPLACE_EXCEPT_WITH_FILTER = buildConf("spark.sql.optimizer.replaceExceptWithFilter") + .internal() + .doc("When true, the apply function of the rule verifies whether the right node of the" + + " except operation is of type Filter or Project followed by Filter. If yes, the rule" + + " further verifies 1) Excluding the filter operations from the right (as well as the" + + " left node, if any) on the top, whether both the nodes evaluates to a same result." + + " 2) The left and right nodes don't contain any SubqueryExpressions. 3) The output" + + " column names of the left node are distinct. If all the conditions are met, the" + + " rule will replace the except operation with a Filter by flipping the filter" + + " condition(s) of the right node.") + .booleanConf + .createWithDefault(true) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1233,6 +1246,8 @@ class SQLConf extends Serializable with Logging { def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH) + def replaceExceptWithFilter: Boolean = getConf(REPLACE_EXCEPT_WITH_FILTER) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala index 85988d2fb948c..0fa1aaeb9e164 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.Alias +import org.apache.spark.sql.catalyst.expressions.{Alias, Not} import org.apache.spark.sql.catalyst.expressions.aggregate.First import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical._ @@ -31,6 +32,7 @@ class ReplaceOperatorSuite extends PlanTest { val batches = Batch("Replace Operators", FixedPoint(100), ReplaceDistinctWithAggregate, + ReplaceExceptWithFilter, ReplaceExceptWithAntiJoin, ReplaceIntersectWithSemiJoin, ReplaceDeduplicateWithAggregate) :: Nil @@ -50,6 +52,108 @@ class ReplaceOperatorSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("replace Except with Filter while both the nodes are of type Filter") { + val attributeA = 'a.int + val attributeB = 'b.int + + val table1 = LocalRelation.fromExternalRows(Seq(attributeA, attributeB), data = Seq(Row(1, 2))) + val table2 = Filter(attributeB === 2, Filter(attributeA === 1, table1)) + val table3 = Filter(attributeB < 1, Filter(attributeA >= 2, table1)) + + val query = Except(table2, table3) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(table1.output, table1.output, + Filter(Not((attributeA.isNotNull && attributeB.isNotNull) && + (attributeA >= 2 && attributeB < 1)), + Filter(attributeB === 2, Filter(attributeA === 1, table1)))).analyze + + comparePlans(optimized, correctAnswer) + } + + test("replace Except with Filter while only right node is of type Filter") { + val attributeA = 'a.int + val attributeB = 'b.int + + val table1 = LocalRelation.fromExternalRows(Seq(attributeA, 
attributeB), data = Seq(Row(1, 2))) + val table2 = Filter(attributeB < 1, Filter(attributeA >= 2, table1)) + + val query = Except(table1, table2) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(table1.output, table1.output, + Filter(Not((attributeA.isNotNull && attributeB.isNotNull) && + (attributeA >= 2 && attributeB < 1)), table1)).analyze + + comparePlans(optimized, correctAnswer) + } + + test("replace Except with Filter while both the nodes are of type Project") { + val attributeA = 'a.int + val attributeB = 'b.int + + val table1 = LocalRelation.fromExternalRows(Seq(attributeA, attributeB), data = Seq(Row(1, 2))) + val table2 = Project(Seq(attributeA, attributeB), table1) + val table3 = Project(Seq(attributeA, attributeB), + Filter(attributeB < 1, Filter(attributeA >= 2, table1))) + + val query = Except(table2, table3) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(table1.output, table1.output, + Filter(Not((attributeA.isNotNull && attributeB.isNotNull) && + (attributeA >= 2 && attributeB < 1)), + Project(Seq(attributeA, attributeB), table1))).analyze + + comparePlans(optimized, correctAnswer) + } + + test("replace Except with Filter while only right node is of type Project") { + val attributeA = 'a.int + val attributeB = 'b.int + + val table1 = LocalRelation.fromExternalRows(Seq(attributeA, attributeB), data = Seq(Row(1, 2))) + val table2 = Filter(attributeB === 2, Filter(attributeA === 1, table1)) + val table3 = Project(Seq(attributeA, attributeB), + Filter(attributeB < 1, Filter(attributeA >= 2, table1))) + + val query = Except(table2, table3) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(table1.output, table1.output, + Filter(Not((attributeA.isNotNull && attributeB.isNotNull) && + (attributeA >= 2 && attributeB < 1)), + Filter(attributeB === 2, Filter(attributeA === 1, table1)))).analyze + + comparePlans(optimized, correctAnswer) + } + + test("replace Except with Filter while left node is Project and right node is Filter") { + val attributeA = 'a.int + val attributeB = 'b.int + + val table1 = LocalRelation.fromExternalRows(Seq(attributeA, attributeB), data = Seq(Row(1, 2))) + val table2 = Project(Seq(attributeA, attributeB), + Filter(attributeB < 1, Filter(attributeA >= 2, table1))) + val table3 = Filter(attributeB === 2, Filter(attributeA === 1, table1)) + + val query = Except(table2, table3) + val optimized = Optimize.execute(query.analyze) + + val correctAnswer = + Aggregate(table1.output, table1.output, + Filter(Not((attributeA.isNotNull && attributeB.isNotNull) && + (attributeA === 1 && attributeB === 2)), + Project(Seq(attributeA, attributeB), + Filter(attributeB < 1, Filter(attributeA >= 2, table1))))).analyze + + comparePlans(optimized, correctAnswer) + } + test("replace Except with Left-anti Join") { val table1 = LocalRelation('a.int, 'b.int) val table2 = LocalRelation('c.int, 'd.int) diff --git a/sql/core/src/test/resources/sql-tests/inputs/except.sql b/sql/core/src/test/resources/sql-tests/inputs/except.sql new file mode 100644 index 0000000000000..1d579e65f3473 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/except.sql @@ -0,0 +1,57 @@ +-- Tests different scenarios of except operation +create temporary view t1 as select * from values + ("one", 1), + ("two", 2), + ("three", 3), + ("one", NULL) + as t1(k, v); + +create temporary view t2 as select * from values + ("one", 1), + ("two", 22), + ("one", 5), + ("one", NULL), + (NULL, 5) + as 
t2(k, v); + + +-- Except operation that will be replaced by left anti join +SELECT * FROM t1 EXCEPT SELECT * FROM t2; + + +-- Except operation that will be replaced by Filter: SPARK-22181 +SELECT * FROM t1 EXCEPT SELECT * FROM t1 where v <> 1 and v <> 2; + + +-- Except operation that will be replaced by Filter: SPARK-22181 +SELECT * FROM t1 where v <> 1 and v <> 22 EXCEPT SELECT * FROM t1 where v <> 2 and v >= 3; + + +-- Except operation that will be replaced by Filter: SPARK-22181 +SELECT t1.* FROM t1, t2 where t1.k = t2.k +EXCEPT +SELECT t1.* FROM t1, t2 where t1.k = t2.k and t1.k != 'one'; + + +-- Except operation that will be replaced by left anti join +SELECT * FROM t2 where v >= 1 and v <> 22 EXCEPT SELECT * FROM t1; + + +-- Except operation that will be replaced by left anti join +SELECT (SELECT min(k) FROM t2 WHERE t2.k = t1.k) min_t2 FROM t1 +MINUS +SELECT (SELECT min(k) FROM t2) abs_min_t2 FROM t1 WHERE t1.k = 'one'; + + +-- Except operation that will be replaced by left anti join +SELECT t1.k +FROM t1 +WHERE t1.v <= (SELECT max(t2.v) + FROM t2 + WHERE t2.k = t1.k) +MINUS +SELECT t1.k +FROM t1 +WHERE t1.v >= (SELECT min(t2.v) + FROM t2 + WHERE t2.k = t1.k); diff --git a/sql/core/src/test/resources/sql-tests/results/except.sql.out b/sql/core/src/test/resources/sql-tests/results/except.sql.out new file mode 100644 index 0000000000000..c9b712d4d2949 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/except.sql.out @@ -0,0 +1,105 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 9 + + +-- !query 0 +create temporary view t1 as select * from values + ("one", 1), + ("two", 2), + ("three", 3), + ("one", NULL) + as t1(k, v) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +create temporary view t2 as select * from values + ("one", 1), + ("two", 22), + ("one", 5), + ("one", NULL), + (NULL, 5) + as t2(k, v) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +SELECT * FROM t1 EXCEPT SELECT * FROM t2 +-- !query 2 schema +struct +-- !query 2 output +three 3 +two 2 + + +-- !query 3 +SELECT * FROM t1 EXCEPT SELECT * FROM t1 where v <> 1 and v <> 2 +-- !query 3 schema +struct +-- !query 3 output +one 1 +one NULL +two 2 + + +-- !query 4 +SELECT * FROM t1 where v <> 1 and v <> 22 EXCEPT SELECT * FROM t1 where v <> 2 and v >= 3 +-- !query 4 schema +struct +-- !query 4 output +two 2 + + +-- !query 5 +SELECT t1.* FROM t1, t2 where t1.k = t2.k +EXCEPT +SELECT t1.* FROM t1, t2 where t1.k = t2.k and t1.k != 'one' +-- !query 5 schema +struct +-- !query 5 output +one 1 +one NULL + + +-- !query 6 +SELECT * FROM t2 where v >= 1 and v <> 22 EXCEPT SELECT * FROM t1 +-- !query 6 schema +struct +-- !query 6 output +NULL 5 +one 5 + + +-- !query 7 +SELECT (SELECT min(k) FROM t2 WHERE t2.k = t1.k) min_t2 FROM t1 +MINUS +SELECT (SELECT min(k) FROM t2) abs_min_t2 FROM t1 WHERE t1.k = 'one' +-- !query 7 schema +struct +-- !query 7 output +NULL +two + + +-- !query 8 +SELECT t1.k +FROM t1 +WHERE t1.v <= (SELECT max(t2.v) + FROM t2 + WHERE t2.k = t1.k) +MINUS +SELECT t1.k +FROM t1 +WHERE t1.v >= (SELECT min(t2.v) + FROM t2 + WHERE t2.k = t1.k) +-- !query 8 schema +struct +-- !query 8 output +two
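
A hedged PySpark sketch (not part of the patch) that rebuilds the `t1` view and re-runs query 3 above, mainly to highlight why the flipped filter must stay null-aware: rows whose `v` is NULL do not appear on the right-hand side of the EXCEPT, so they must survive the rewrite. The app name is invented; the data and expected output mirror the generated results above.

```python
# Sketch: reproduce query 3 from except.sql and inspect the result.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("except-null-sketch").getOrCreate()

spark.createDataFrame(
    [("one", 1), ("two", 2), ("three", 3), ("one", None)],
    ["k", "v"]).createOrReplaceTempView("t1")

rows = spark.sql("""
    SELECT * FROM t1
    EXCEPT
    SELECT * FROM t1 WHERE v <> 1 AND v <> 2
""").collect()

# Expected rows (order not guaranteed): ("one", 1), ("one", None), ("two", 2).
# ("three", 3) is removed; ("one", None) is kept because the rewritten plan
# filters with NOT(isnotnull(v) AND v <> 1 AND v <> 2), which keeps NULL v.
for row in rows:
    print(row)
```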